mirror of https://github.com/leafspark/AutoGGUF
add lora convert feature
This commit is contained in:
parent f12bd1efec
commit 5843b51c0c

Changed file: src/AutoGGUF.py (297 changed lines)
@@ -326,6 +326,95 @@ def __init__(self):
        main_widget.setLayout(main_layout)
        self.setCentralWidget(main_widget)

        # LoRA Conversion Section
        lora_group = QGroupBox(LORA_CONVERSION)
        lora_layout = QFormLayout()

        self.lora_input = QLineEdit()
        lora_input_button = QPushButton(BROWSE)
        lora_input_button.clicked.connect(self.browse_lora_input)
        lora_input_layout = QHBoxLayout()
        lora_input_layout.addWidget(self.lora_input)
        lora_input_layout.addWidget(lora_input_button)
        lora_layout.addRow(self.create_label(LORA_INPUT_PATH, SELECT_LORA_INPUT_DIRECTORY), lora_input_layout)

        self.lora_output = QLineEdit()
        lora_output_button = QPushButton(BROWSE)
        lora_output_button.clicked.connect(self.browse_lora_output)
        lora_output_layout = QHBoxLayout()
        lora_output_layout.addWidget(self.lora_output)
        lora_output_layout.addWidget(lora_output_button)
        lora_layout.addRow(self.create_label(LORA_OUTPUT_PATH, SELECT_LORA_OUTPUT_FILE), lora_output_layout)

        # Output Type Dropdown
        self.lora_output_type_combo = QComboBox()
        self.lora_output_type_combo.addItems(["GGML", "GGUF"])
        self.lora_output_type_combo.currentIndexChanged.connect(self.update_base_model_visibility)  # Connect to update visibility
        lora_layout.addRow(self.create_label(OUTPUT_TYPE, SELECT_OUTPUT_TYPE), self.lora_output_type_combo)

        # Base Model Path (initially hidden)
        self.base_model_path = QLineEdit()
        base_model_button = QPushButton(BROWSE)
        base_model_button.clicked.connect(self.browse_base_model)
        base_model_layout = QHBoxLayout()
        base_model_layout.addWidget(self.base_model_path)
        base_model_layout.addWidget(base_model_button)
        self.base_model_widget = QWidget()
        self.base_model_widget.setLayout(base_model_layout)
        self.base_model_widget.setVisible(False)  # Initially hidden
        lora_layout.addRow(self.create_label(BASE_MODEL, SELECT_BASE_MODEL_FILE), self.base_model_widget)

        lora_convert_button = QPushButton(CONVERT_LORA)
        lora_convert_button.clicked.connect(self.convert_lora)
        lora_layout.addRow(lora_convert_button)

        lora_group.setLayout(lora_layout)
        right_layout.addWidget(lora_group)

        # Export LoRA
        export_lora_group = QGroupBox(EXPORT_LORA)
        export_lora_layout = QFormLayout()

        self.export_lora_model = QLineEdit()
        export_lora_model_button = QPushButton(BROWSE)
        export_lora_model_button.clicked.connect(self.browse_export_lora_model)
        export_lora_model_layout = QHBoxLayout()
        export_lora_model_layout.addWidget(self.export_lora_model)
        export_lora_model_layout.addWidget(export_lora_model_button)
        export_lora_layout.addRow(self.create_label(MODEL, SELECT_MODEL_FILE), export_lora_model_layout)

        self.export_lora_output = QLineEdit()
        export_lora_output_button = QPushButton(BROWSE)
        export_lora_output_button.clicked.connect(self.browse_export_lora_output)
        export_lora_output_layout = QHBoxLayout()
        export_lora_output_layout.addWidget(self.export_lora_output)
        export_lora_output_layout.addWidget(export_lora_output_button)
        export_lora_layout.addRow(self.create_label(OUTPUT, SELECT_OUTPUT_FILE), export_lora_output_layout)

        # GGML LoRA Adapters
        self.export_lora_adapters = QListWidget()
        add_adapter_button = QPushButton(ADD_ADAPTER)
        add_adapter_button.clicked.connect(self.add_lora_adapter)
        adapters_layout = QVBoxLayout()
        adapters_layout.addWidget(self.export_lora_adapters)
        buttons_layout = QHBoxLayout()
        buttons_layout.addWidget(add_adapter_button)
        adapters_layout.addLayout(buttons_layout)
        export_lora_layout.addRow(self.create_label(GGML_LORA_ADAPTERS, SELECT_LORA_ADAPTER_FILES), adapters_layout)

        # Threads
        self.export_lora_threads = QSpinBox()
        self.export_lora_threads.setRange(1, 64)
        self.export_lora_threads.setValue(8)  # Default value
        export_lora_layout.addRow(self.create_label(THREADS, NUMBER_OF_THREADS_FOR_LORA_EXPORT), self.export_lora_threads)

        export_lora_button = QPushButton(EXPORT_LORA)
        export_lora_button.clicked.connect(self.export_lora)
        export_lora_layout.addRow(export_lora_button)

        export_lora_group.setLayout(export_lora_layout)
        right_layout.addWidget(export_lora_group)  # Add the Export LoRA group to the right layout

        # Modify the task list to support right-click menu
        self.task_list.setContextMenuPolicy(Qt.ContextMenuPolicy.CustomContextMenu)
        self.task_list.customContextMenuRequested.connect(self.show_task_context_menu)
@@ -361,6 +450,9 @@ def refresh_backends(self):
            self.backend_combo.addItem(NO_BACKENDS_AVAILABLE)
            self.backend_combo.setEnabled(False)
        self.logger.info(FOUND_VALID_BACKENDS.format(self.backend_combo.count()))

    def update_base_model_visibility(self, index):
        self.base_model_widget.setVisible(self.lora_output_type_combo.itemText(index) == "GGUF")

    def save_preset(self):
        self.logger.info(SAVING_PRESET)
@@ -437,6 +529,128 @@ def save_task_preset(self, task_item):
                QMessageBox.information(self, TASK_PRESET_SAVED, TASK_PRESET_SAVED_TO.format(file_name))
                break

    def browse_export_lora_model(self):
        self.logger.info(BROWSING_FOR_EXPORT_LORA_MODEL_FILE)
        model_file, _ = QFileDialog.getOpenFileName(self, SELECT_MODEL_FILE, "", GGUF_FILES)
        if model_file:
            self.export_lora_model.setText(os.path.abspath(model_file))

    def browse_export_lora_output(self):
        self.logger.info(BROWSING_FOR_EXPORT_LORA_OUTPUT_FILE)
        output_file, _ = QFileDialog.getSaveFileName(self, SELECT_OUTPUT_FILE, "", GGUF_FILES)
        if output_file:
            self.export_lora_output.setText(os.path.abspath(output_file))

    def add_lora_adapter(self):
        self.logger.info(ADDING_LORA_ADAPTER)
        adapter_path, _ = QFileDialog.getOpenFileName(self, SELECT_LORA_ADAPTER_FILE, "", LORA_FILES)
        if adapter_path:
            # Create a widget to hold the path and scale input
            adapter_widget = QWidget()
            adapter_layout = QHBoxLayout(adapter_widget)

            path_input = QLineEdit(adapter_path)
            path_input.setReadOnly(True)
            adapter_layout.addWidget(path_input)

            scale_input = QLineEdit("1.0")  # Default scale value
            adapter_layout.addWidget(scale_input)

            delete_button = QPushButton(DELETE_ADAPTER)
            delete_button.clicked.connect(lambda: self.delete_lora_adapter_item(adapter_widget))
            adapter_layout.addWidget(delete_button)

            # Add the widget to the list
            list_item = QListWidgetItem(self.export_lora_adapters)
            list_item.setSizeHint(adapter_widget.sizeHint())
            self.export_lora_adapters.addItem(list_item)
            self.export_lora_adapters.setItemWidget(list_item, adapter_widget)

    def browse_base_model(self):
        self.logger.info(BROWSING_FOR_BASE_MODEL_FOLDER)  # Updated log message
        base_model_folder = QFileDialog.getExistingDirectory(self, SELECT_BASE_MODEL_FOLDER)
        if base_model_folder:
            self.base_model_path.setText(os.path.abspath(base_model_folder))

    def delete_lora_adapter_item(self, adapter_widget):
        self.logger.info(DELETING_LORA_ADAPTER)
        # Find the QListWidgetItem containing the adapter_widget
        for i in range(self.export_lora_adapters.count()):
            item = self.export_lora_adapters.item(i)
            if self.export_lora_adapters.itemWidget(item) == adapter_widget:
                self.export_lora_adapters.takeItem(i)  # Remove the item
                break

    def export_lora(self):
        self.logger.info(STARTING_LORA_EXPORT)
        try:
            model_path = self.export_lora_model.text()
            output_path = self.export_lora_output.text()
            lora_adapters = []

            for i in range(self.export_lora_adapters.count()):
                item = self.export_lora_adapters.item(i)
                adapter_widget = self.export_lora_adapters.itemWidget(item)
                path_input = adapter_widget.layout().itemAt(0).widget()
                scale_input = adapter_widget.layout().itemAt(1).widget()
                adapter_path = path_input.text()
                adapter_scale = scale_input.text()
                lora_adapters.append((adapter_path, adapter_scale))

            if not model_path:
                raise ValueError(MODEL_PATH_REQUIRED)
            if not output_path:
                raise ValueError(OUTPUT_PATH_REQUIRED)
            if not lora_adapters:
                raise ValueError(AT_LEAST_ONE_LORA_ADAPTER_REQUIRED)

            backend_path = self.backend_combo.currentData()
            if not backend_path:
                raise ValueError(NO_BACKEND_SELECTED)

            command = [os.path.join(backend_path, "llama-export-lora"),
                       "--model", model_path,
                       "--output", output_path]

            for adapter_path, adapter_scale in lora_adapters:
                if adapter_path:
                    if adapter_scale:
                        try:
                            scale_value = float(adapter_scale)
                            command.extend(["--lora-scaled", adapter_path, str(scale_value)])
                        except ValueError:
                            raise ValueError(INVALID_LORA_SCALE_VALUE)
                    else:
                        command.extend(["--lora", adapter_path])

            threads = self.export_lora_threads.value()
            command.extend(["--threads", str(threads)])

            logs_path = self.logs_input.text()
            ensure_directory(logs_path)

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            log_file = os.path.join(logs_path, f"lora_export_{timestamp}.log")

            thread = QuantizationThread(command, backend_path, log_file)
            self.quant_threads.append(thread)

            task_item = TaskListItem(EXPORTING_LORA, log_file)
            list_item = QListWidgetItem(self.task_list)
            list_item.setSizeHint(task_item.sizeHint())
            self.task_list.addItem(list_item)
            self.task_list.setItemWidget(list_item, task_item)

            thread.status_signal.connect(task_item.update_status)
            thread.finished_signal.connect(lambda: self.task_finished(thread))
            thread.error_signal.connect(lambda err: self.handle_error(err, task_item))
            thread.start()
            self.logger.info(LORA_EXPORT_TASK_STARTED)
        except ValueError as e:
            self.show_error(str(e))
        except Exception as e:
            self.show_error(ERROR_STARTING_LORA_EXPORT.format(str(e)))

    def restart_task(self, task_item):
        self.logger.info(RESTARTING_TASK.format(task_item.task_name))
        for thread in self.quant_threads:
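For reference, the command list assembled by export_lora above resolves to an invocation of the backend's llama-export-lora tool along the following lines. This is a hedged sketch only; the paths are hypothetical placeholders and are not taken from this commit.

    # Hypothetical shape of the command built for one scaled adapter; all paths are placeholders.
    command = [
        "/backends/llama.cpp/llama-export-lora",   # os.path.join(backend_path, "llama-export-lora")
        "--model", "/models/base-model.gguf",
        "--output", "/models/base-model-with-lora.gguf",
        "--lora-scaled", "/loras/ggml-adapter-model.bin", "1.0",
        "--threads", "8",
    ]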
@@ -451,6 +665,82 @@ def restart_task(self, task_item):
                task_item.update_status(IN_PROGRESS)
                break

    def browse_lora_input(self):
        self.logger.info(BROWSING_FOR_LORA_INPUT_DIRECTORY)
        lora_input_path = QFileDialog.getExistingDirectory(self, SELECT_LORA_INPUT_DIRECTORY)
        if lora_input_path:
            self.lora_input.setText(os.path.abspath(lora_input_path))
            ensure_directory(lora_input_path)

    def browse_lora_output(self):
        self.logger.info(BROWSING_FOR_LORA_OUTPUT_FILE)
        lora_output_file, _ = QFileDialog.getSaveFileName(self, SELECT_LORA_OUTPUT_FILE, "", GGUF_AND_BIN_FILES)
        if lora_output_file:
            self.lora_output.setText(os.path.abspath(lora_output_file))

    def convert_lora(self):
        self.logger.info(STARTING_LORA_CONVERSION)
        try:
            lora_input_path = self.lora_input.text()
            lora_output_path = self.lora_output.text()
            lora_output_type = self.lora_output_type_combo.currentText()

            if not lora_input_path:
                raise ValueError(LORA_INPUT_PATH_REQUIRED)
            if not lora_output_path:
                raise ValueError(LORA_OUTPUT_PATH_REQUIRED)

            if lora_output_type == "GGUF":  # Use new file and parameters for GGUF
                command = ["python", "src/convert_lora_to_gguf.py", "--outfile", lora_output_path, lora_input_path]
                base_model_path = self.base_model_path.text()
                if not base_model_path:
                    raise ValueError(BASE_MODEL_PATH_REQUIRED)
                command.extend(["--base", base_model_path])
            else:  # Use old GGML parameters for GGML
                command = ["python", "src/convert_lora_to_ggml.py", lora_input_path]

            logs_path = self.logs_input.text()
            ensure_directory(logs_path)

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            log_file = os.path.join(logs_path, f"lora_conversion_{timestamp}.log")

            thread = QuantizationThread(command, os.getcwd(), log_file)
            self.quant_threads.append(thread)

            task_name = LORA_CONVERSION_FROM_TO.format(os.path.basename(lora_input_path), os.path.basename(lora_output_path))
            task_item = TaskListItem(task_name, log_file)
            list_item = QListWidgetItem(self.task_list)
            list_item.setSizeHint(task_item.sizeHint())
            self.task_list.addItem(list_item)
            self.task_list.setItemWidget(list_item, task_item)

            thread.status_signal.connect(task_item.update_status)
            thread.finished_signal.connect(lambda: self.lora_conversion_finished(thread, lora_input_path, lora_output_path))
            thread.error_signal.connect(lambda err: self.handle_error(err, task_item))
            thread.start()
            self.logger.info(LORA_CONVERSION_TASK_STARTED)
        except ValueError as e:
            self.show_error(str(e))
        except Exception as e:
            self.show_error(ERROR_STARTING_LORA_CONVERSION.format(str(e)))

    def lora_conversion_finished(self, thread, input_path, output_path):
        self.logger.info(LORA_CONVERSION_FINISHED)
        if thread in self.quant_threads:
            self.quant_threads.remove(thread)
        try:
            # Only move the file if the output type is GGML
            if self.lora_output_type_combo.currentText() == "GGML":
                source_file = os.path.join(input_path, "ggml-adapter-model.bin")
                if os.path.exists(source_file):
                    shutil.move(source_file, output_path)
                    self.logger.info(LORA_FILE_MOVED.format(source_file, output_path))
                else:
                    self.logger.warning(LORA_FILE_NOT_FOUND.format(source_file))
        except Exception as e:
            self.logger.error(ERROR_MOVING_LORA_FILE.format(str(e)))

    def download_finished(self, extract_dir):
        self.logger.info(DOWNLOAD_FINISHED_EXTRACTED_TO.format(extract_dir))
        self.download_button.setEnabled(True)
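For context, the two branches of convert_lora above assemble commands of roughly these shapes; the directory and file names below are hypothetical placeholders, not values from this commit.

    # Hypothetical command shapes for the GGUF and GGML conversion branches; paths are placeholders.
    gguf_command = [
        "python", "src/convert_lora_to_gguf.py",
        "--outfile", "/loras/adapter.gguf",
        "/loras/my-peft-adapter",          # LoRA input directory
        "--base", "/models/base-hf-model", # base model directory (required for GGUF)
    ]
    ggml_command = [
        "python", "src/convert_lora_to_ggml.py",
        "/loras/my-peft-adapter",
    ]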
@@ -945,6 +1235,10 @@ def generate_imatrix(self):
            if not os.path.exists(backend_path):
                raise FileNotFoundError(BACKEND_PATH_NOT_EXIST.format(backend_path))

            # Check if the Model area is empty
            if not self.imatrix_model.text():
                raise ValueError(MODEL_PATH_REQUIRED_FOR_IMATRIX)

            command = [
                os.path.join(backend_path, "llama-imatrix"),
                "-f", self.imatrix_datafile.text(),
@@ -966,7 +1260,8 @@ def generate_imatrix(self):
             thread = QuantizationThread(command, backend_path, log_file)
             self.quant_threads.append(thread)

-            task_item = TaskListItem(GENERATING_IMATRIX, log_file)
+            task_name = GENERATING_IMATRIX_FOR.format(os.path.basename(self.imatrix_model.text()))
+            task_item = TaskListItem(task_name, log_file)
             list_item = QListWidgetItem(self.task_list)
             list_item.setSizeHint(task_item.sizeHint())
             self.task_list.addItem(list_item)
(File diff suppressed because it is too large.)
@@ -0,0 +1,153 @@
#!/usr/bin/env python3
from __future__ import annotations

import logging
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any, BinaryIO, Sequence

import numpy as np
import torch

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("lora-to-gguf")

NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}


def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
    fout.write(b"ggla"[::-1])  # magic (ggml lora)
    fout.write(struct.pack("i", 1))  # file version
    fout.write(struct.pack("i", params["r"]))
    # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
    # but some models ship a float value instead
    # let's convert to int, but fail if lossless conversion is not possible
    assert (
        int(params["lora_alpha"]) == params["lora_alpha"]
    ), "cannot convert float to int losslessly"
    fout.write(struct.pack("i", int(params["lora_alpha"])))


def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
    sname = name.encode("utf-8")
    fout.write(
        struct.pack(
            "iii",
            len(shape),
            len(sname),
            NUMPY_TYPE_TO_FTYPE[data_type.name],
        )
    )
    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
    fout.write(sname)
    fout.seek((fout.tell() + 31) & -32)


def pyinstaller_include():
    # PyInstaller import
    pass


if __name__ == '__main__':
    if len(sys.argv) < 2:
        logger.info(f"Usage: python {sys.argv[0]} <path> [arch]")
        logger.info("Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'")
        logger.info(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
        sys.exit(1)

    input_json = os.path.join(sys.argv[1], "adapter_config.json")
    input_model = os.path.join(sys.argv[1], "adapter_model.bin")
    output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")

    if os.path.exists(input_model):
        model = torch.load(input_model, map_location="cpu")
    else:
        input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
        # lazy import load_file only if lora is in safetensors format.
        from safetensors.torch import load_file
        model = load_file(input_model, device="cpu")

    arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"

    if arch_name not in gguf.MODEL_ARCH_NAMES.values():
        logger.error(f"Error: unsupported architecture {arch_name}")
        sys.exit(1)

    arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
    name_map = gguf.TensorNameMap(arch, 200)  # 200 layers ought to be enough for anyone

    with open(input_json, "r") as f:
        params = json.load(f)

    if params["peft_type"] != "LORA":
        logger.error(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
        sys.exit(1)

    if params["fan_in_fan_out"] is True:
        logger.error("Error: param fan_in_fan_out is not supported")
        sys.exit(1)

    if params["bias"] is not None and params["bias"] != "none":
        logger.error("Error: param bias is not supported")
        sys.exit(1)

    # TODO: these seem to be layers that have been trained but without lora.
    # doesn't seem widely used but eventually should be supported
    if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
        logger.error("Error: param modules_to_save is not supported")
        sys.exit(1)

    with open(output_path, "wb") as fout:
        fout.truncate()

        write_file_header(fout, params)
        for k, v in model.items():
            orig_k = k
            if k.endswith(".default.weight"):
                k = k.replace(".default.weight", ".weight")
            if k in ["llama_proj.weight", "llama_proj.bias"]:
                continue
            if k.endswith("lora_A.weight"):
                if v.dtype != torch.float16 and v.dtype != torch.float32:
                    v = v.float()
                v = v.T
            else:
                v = v.float()

            t = v.detach().numpy()

            prefix = "base_model.model."
            if k.startswith(prefix):
                k = k[len(prefix) :]

            lora_suffixes = (".lora_A.weight", ".lora_B.weight")
            if k.endswith(lora_suffixes):
                suffix = k[-len(lora_suffixes[0]):]
                k = k[: -len(lora_suffixes[0])]
            else:
                logger.error(f"Error: unrecognized tensor name {orig_k}")
                sys.exit(1)

            tname = name_map.get_name(k)
            if tname is None:
                logger.error(f"Error: could not map tensor name {orig_k}")
                logger.error(" Note: the arch parameter must be specified if the model is not llama")
                sys.exit(1)

            if suffix == ".lora_A.weight":
                tname += ".weight.loraA"
            elif suffix == ".lora_B.weight":
                tname += ".weight.loraB"
            else:
                assert False

            logger.info(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
            write_tensor_header(fout, tname, t.shape, t.dtype)
            t.tofile(fout)

    logger.info(f"Converted {input_json} and {input_model} to {output_path}")
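This new file appears to be src/convert_lora_to_ggml.py, the script the GGML branch of convert_lora invokes. A minimal usage sketch, assuming a placeholder adapter directory that contains adapter_config.json plus adapter_model.bin or adapter_model.safetensors:

    # Hypothetical invocation mirroring the GGML branch of convert_lora; the path is a placeholder.
    import subprocess

    subprocess.run(
        ["python", "src/convert_lora_to_ggml.py", "/loras/my-peft-adapter", "llama"],
        check=True,
    )
    # On success the script writes ggml-adapter-model.bin into the adapter directory,
    # which lora_conversion_finished then moves to the chosen output path.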
@@ -0,0 +1,395 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from __future__ import annotations

from dataclasses import dataclass
import logging
import argparse
import os
import sys
import json
from math import prod
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast

import torch

if TYPE_CHECKING:
    from torch import Tensor

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf

# reuse model definitions from convert_hf_to_gguf.py
from convert_hf_to_gguf import LazyTorchTensor, Model

logger = logging.getLogger("lora-to-gguf")


@dataclass
class PartialLoraTensor:
    A: Tensor | None = None
    B: Tensor | None = None


# magic to support tensor shape modifications and splitting
class LoraTorchTensor:
    _lora_A: Tensor  # (n_rank, row_size)
    _lora_B: Tensor  # (col_size, n_rank)
    _rank: int

    def __init__(self, A: Tensor, B: Tensor):
        assert len(A.shape) == len(B.shape)
        assert A.shape[-2] == B.shape[-1]
        if A.dtype != B.dtype:
            A = A.to(torch.float32)
            B = B.to(torch.float32)
        self._lora_A = A
        self._lora_B = B
        self._rank = B.shape[-1]

    def get_lora_A_B(self) -> tuple[Tensor, Tensor]:
        return (self._lora_A, self._lora_B)

    def __getitem__(
        self,
        indices: (
            SupportsIndex
            | slice
            | tuple[SupportsIndex | slice | Tensor, ...]  # TODO: add ellipsis in the type signature
        ),
    ) -> LoraTorchTensor:
        shape = self.shape
        if isinstance(indices, SupportsIndex):
            if len(shape) > 2:
                return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
            else:
                raise NotImplementedError  # can't return a vector
        elif isinstance(indices, slice):
            if len(shape) > 2:
                return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
            else:
                return LoraTorchTensor(self._lora_A, self._lora_B[indices])
        elif isinstance(indices, tuple):
            assert len(indices) > 0
            if indices[-1] is Ellipsis:
                return self[indices[:-1]]
            # expand ellipsis
            indices = tuple(
                u
                for v in (
                    (
                        (slice(None, None) for _ in range(len(indices) - 1))
                        if i is Ellipsis
                        else (i,)
                    )
                    for i in indices
                )
                for u in v
            )

            if len(indices) < len(shape):
                indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape))))

            # TODO: make sure this is correct
            indices_A = (
                *(
                    (
                        j.__index__() % self._lora_A.shape[i]
                        if isinstance(j, SupportsIndex)
                        else slice(None, None)
                    )
                    for i, j in enumerate(indices[:-2])
                ),
                slice(None, None),
                indices[-1],
            )
            indices_B = indices[:-1]
            return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
        else:
            raise NotImplementedError  # unknown indice type

    @property
    def dtype(self) -> torch.dtype:
        assert self._lora_A.dtype == self._lora_B.dtype
        return self._lora_A.dtype

    @property
    def shape(self) -> tuple[int, ...]:
        assert len(self._lora_A.shape) == len(self._lora_B.shape)
        return (*self._lora_B.shape[:-1], self._lora_A.shape[-1])

    def size(self, dim=None):
        assert dim is None
        return self.shape

    def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
        if isinstance(shape[0], tuple):
            new_shape: tuple[int, ...] = shape[0]
        else:
            new_shape = cast(tuple[int, ...], shape)
        orig_shape = self.shape
        if len(new_shape) < 2:
            raise NotImplementedError  # can't become a vector

        # expand -1 in the shape
        if any(dim == -1 for dim in new_shape):
            n_elems = prod(orig_shape)
            n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
            assert n_elems % n_new_elems == 0
            new_shape = (*(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),)

        if new_shape[-1] != orig_shape[-1]:
            raise NotImplementedError  # can't reshape the row size trivially

        shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
        shape_B = (*new_shape[:-1], self._rank)
        return LoraTorchTensor(
            self._lora_A.reshape(shape_A),
            self._lora_B.reshape(shape_B),
        )

    def reshape_as(self, other: Tensor) -> LoraTorchTensor:
        return self.reshape(*other.shape)

    def view(self, *size: int) -> LoraTorchTensor:
        return self.reshape(*size)

    def permute(self, *dims: int) -> LoraTorchTensor:
        shape = self.shape
        dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
        if dims[-1] == -1:
            # TODO: support higher dimensional A shapes bigger than 1
            assert all(dim == 1 for dim in self._lora_A.shape[:-2])
            return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
        if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
            return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims))
        else:
            # TODO: compose the above two
            raise NotImplementedError

    def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:
        shape = self.shape
        dims = [i for i in range(len(shape))]
        dims[dim0], dims[dim1] = dims[dim1], dims[dim0]
        return self.permute(*dims)

    def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor:
        return self.transpose(axis0, axis1)

    def to(self, *args, **kwargs):
        return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))

    @classmethod
    def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
        del types  # unused

        if kwargs is None:
            kwargs = {}

        if func is torch.permute:
            return type(args[0]).permute(*args, **kwargs)
        elif func is torch.reshape:
            return type(args[0]).reshape(*args, **kwargs)
        elif func is torch.stack:
            assert isinstance(args[0], Sequence)
            dim = kwargs.get("dim", 0)
            assert dim == 0
            return LoraTorchTensor(
                torch.stack([a._lora_A for a in args[0]], dim),
                torch.stack([b._lora_B for b in args[0]], dim),
            )
        elif func is torch.cat:
            assert isinstance(args[0], Sequence)
            dim = kwargs.get("dim", 0)
            assert dim == 0
            if len(args[0][0].shape) > 2:
                return LoraTorchTensor(
                    torch.cat([a._lora_A for a in args[0]], dim),
                    torch.cat([b._lora_B for b in args[0]], dim),
                )
            elif all(torch.equal(args[0][0]._lora_A, t._lora_A) for t in args[0][1:]):
                return LoraTorchTensor(
                    args[0][0]._lora_A,
                    torch.cat([b._lora_B for b in args[0]], dim),
                )
            else:
                raise NotImplementedError
        else:
            raise NotImplementedError


def get_base_tensor_name(lora_tensor_name: str) -> str:
    base_name = lora_tensor_name.replace("base_model.model.", "")
    base_name = base_name.replace(".lora_A.weight", ".weight")
    base_name = base_name.replace(".lora_B.weight", ".weight")
    return base_name


def pyinstaller_include():
    # PyInstaller import
    pass


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
    )
    parser.add_argument(
        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
    )
    parser.add_argument(
        "--bigendian", action="store_true",
        help="model is executed on big endian machine",
    )
    parser.add_argument(
        "--no-lazy", action="store_true",
        help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
    )
    parser.add_argument(
        "--verbose", action="store_true",
        help="increase output verbosity",
    )
    parser.add_argument(
        "--dry-run", action="store_true",
        help="only print out what will be done, without writing any new files",
    )
    parser.add_argument(
        "--base", type=Path, required=True,
        help="directory containing base model file",
    )
    parser.add_argument(
        "lora_path", type=Path,
        help="directory containing LoRA adapter file",
    )

    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    ftype_map: dict[str, gguf.LlamaFileType] = {
        "f32": gguf.LlamaFileType.ALL_F32,
        "f16": gguf.LlamaFileType.MOSTLY_F16,
        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
        "auto": gguf.LlamaFileType.GUESSED,
    }

    ftype = ftype_map[args.outtype]

    dir_base_model: Path = args.base
    dir_lora: Path = args.lora_path
    lora_config = dir_lora / "adapter_config.json"
    input_model = dir_lora / "adapter_model.safetensors"

    if args.outfile is not None:
        fname_out = args.outfile
    else:
        # output in the same directory as the model by default
        fname_out = dir_lora

    if os.path.exists(input_model):
        # lazy import load_file only if lora is in safetensors format.
        from safetensors.torch import load_file

        lora_model = load_file(input_model, device="cpu")
    else:
        input_model = os.path.join(dir_lora, "adapter_model.bin")
        lora_model = torch.load(input_model, map_location="cpu", weights_only=True)

    # load base model
    logger.info(f"Loading base model: {dir_base_model.name}")
    hparams = Model.load_hparams(dir_base_model)
    with torch.inference_mode():
        try:
            model_class = Model.from_model_architecture(hparams["architectures"][0])
        except NotImplementedError:
            logger.error(f"Model {hparams['architectures'][0]} is not supported")
            sys.exit(1)

        class LoraModel(model_class):
            model_arch = model_class.model_arch

            lora_alpha: float

            def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
                super().__init__(*args, **kwargs)

                self.dir_model_card = dir_lora_model
                self.lora_alpha = float(lora_alpha)

            def set_type(self):
                self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
                self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")

            def set_gguf_parameters(self):
                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
                super().set_gguf_parameters()

            def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                tensor_map: dict[str, PartialLoraTensor] = {}

                for name, tensor in lora_model.items():
                    if self.lazy:
                        tensor = LazyTorchTensor.from_eager(tensor)
                    base_name = get_base_tensor_name(name)
                    is_lora_a = ".lora_A.weight" in name
                    is_lora_b = ".lora_B.weight" in name
                    if not is_lora_a and not is_lora_b:
                        if ".base_layer.weight" in name:
                            continue
                        logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
                        sys.exit(1)

                    if base_name in tensor_map:
                        if is_lora_a:
                            tensor_map[base_name].A = tensor
                        else:
                            tensor_map[base_name].B = tensor
                    else:
                        if is_lora_a:
                            tensor_map[base_name] = PartialLoraTensor(A=tensor)
                        else:
                            tensor_map[base_name] = PartialLoraTensor(B=tensor)

                for name, tensor in tensor_map.items():
                    assert tensor.A is not None
                    assert tensor.B is not None
                    yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))

            def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
                dest = super().modify_tensors(data_torch, name, bid)
                for dest_name, dest_data in dest:
                    assert isinstance(dest_data, LoraTorchTensor)
                    lora_a, lora_b = dest_data.get_lora_A_B()

                    yield (dest_name + ".lora_a", lora_a)
                    yield (dest_name + ".lora_b", lora_b)

        with open(lora_config, "r") as f:
            lparams: dict[str, Any] = json.load(f)

        alpha: float = lparams["lora_alpha"]

        model_instance = LoraModel(
            dir_base_model,
            ftype,
            fname_out,
            is_big_endian=args.bigendian,
            use_temp_file=False,
            eager=args.no_lazy,
            dry_run=args.dry_run,
            dir_lora_model=dir_lora,
            lora_alpha=alpha,
        )

        logger.info("Exporting model...")
        model_instance.write()
        logger.info(f"Model successfully exported to {model_instance.fname_out}")
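Likewise, this file appears to be src/convert_lora_to_gguf.py, used for the GGUF output type; it requires the base model directory because it reuses the model classes from convert_hf_to_gguf.py. A hedged usage sketch with placeholder paths:

    # Hypothetical invocation mirroring the GGUF branch of convert_lora; all paths are placeholders.
    import subprocess

    subprocess.run(
        [
            "python", "src/convert_lora_to_gguf.py",
            "--outfile", "/loras/adapter.gguf",
            "/loras/my-peft-adapter",            # directory with adapter_config.json + weights
            "--base", "/models/base-hf-model",   # directory with the base HF model
        ],
        check=True,
    )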
@@ -0,0 +1,9 @@
from .constants import *
from .lazy import *
from .gguf_reader import *
from .gguf_writer import *
from .quants import *
from .tensor_mapping import *
from .vocab import *
from .utility import *
from .metadata import *
(File diff suppressed because it is too large.)
@@ -0,0 +1,15 @@
# This file left for compatibility. If you want to use the GGUF API from Python
# then don't import gguf/gguf.py directly. If you're looking for examples, see the
# examples/ directory for gguf-py

import importlib
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))

# Compatibility for people trying to import gguf/gguf.py directly instead of as a package.
importlib.invalidate_caches()
import gguf  # noqa: E402

importlib.reload(gguf)
@@ -0,0 +1,317 @@
#
# GGUF file reading/modification support. For API usage information,
# please see the files scripts/ for some fairly simple examples.
#
from __future__ import annotations

import logging
import os
from collections import OrderedDict
from typing import Any, Literal, NamedTuple, TypeVar, Union

import numpy as np
import numpy.typing as npt

from .quants import quant_shape_to_byte_shape

if __name__ == "__main__":
    import sys
    from pathlib import Path

    # Allow running file in package as a script.
    sys.path.insert(0, str(Path(__file__).parent.parent))

from gguf.constants import (
    GGML_QUANT_SIZES,
    GGUF_DEFAULT_ALIGNMENT,
    GGUF_MAGIC,
    GGUF_VERSION,
    GGMLQuantizationType,
    GGUFValueType,
)

logger = logging.getLogger(__name__)

READER_SUPPORTED_VERSIONS = [2, GGUF_VERSION]


class ReaderField(NamedTuple):
    # Offset to start of this field.
    offset: int

    # Name of the field (not necessarily from file data).
    name: str

    # Data parts. Some types have multiple components, such as strings
    # that consist of a length followed by the string data.
    parts: list[npt.NDArray[Any]] = []

    # Indexes into parts that we can call the actual data. For example
    # an array of strings will be populated with indexes to the actual
    # string data.
    data: list[int] = [-1]

    types: list[GGUFValueType] = []


class ReaderTensor(NamedTuple):
    name: str
    tensor_type: GGMLQuantizationType
    shape: npt.NDArray[np.uint32]
    n_elements: int
    n_bytes: int
    data_offset: int
    data: npt.NDArray[Any]
    field: ReaderField


class GGUFReader:
    # I - same as host, S - swapped
    byte_order: Literal['I', 'S'] = 'I'
    alignment: int = GGUF_DEFAULT_ALIGNMENT
    data_offset: int

    # Note: Internal helper, API may change.
    gguf_scalar_to_np: dict[GGUFValueType, type[np.generic]] = {
        GGUFValueType.UINT8: np.uint8,
        GGUFValueType.INT8: np.int8,
        GGUFValueType.UINT16: np.uint16,
        GGUFValueType.INT16: np.int16,
        GGUFValueType.UINT32: np.uint32,
        GGUFValueType.INT32: np.int32,
        GGUFValueType.FLOAT32: np.float32,
        GGUFValueType.UINT64: np.uint64,
        GGUFValueType.INT64: np.int64,
        GGUFValueType.FLOAT64: np.float64,
        GGUFValueType.BOOL: np.bool_,
    }

    def __init__(self, path: os.PathLike[str] | str, mode: Literal['r', 'r+', 'c'] = 'r'):
        self.data = np.memmap(path, mode = mode)
        offs = 0

        # Check for GGUF magic
        if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
            raise ValueError('GGUF magic invalid')
        offs += 4

        # Check GGUF version
        temp_version = self._get(offs, np.uint32)
        if temp_version[0] & 65535 == 0:
            # If we get 0 here that means it's (probably) a GGUF file created for
            # the opposite byte order of the machine this script is running on.
            self.byte_order = 'S'
            temp_version = temp_version.newbyteorder(self.byte_order)
        version = temp_version[0]
        if version not in READER_SUPPORTED_VERSIONS:
            raise ValueError(f'Sorry, file appears to be version {version} which we cannot handle')
        self.fields: OrderedDict[str, ReaderField] = OrderedDict()
        self.tensors: list[ReaderTensor] = []
        offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32]))

        # Check tensor count and kv count
        temp_counts = self._get(offs, np.uint64, 2)
        offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64]))
        offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64]))
        tensor_count, kv_count = temp_counts
        offs = self._build_fields(offs, kv_count)

        # Build Tensor Info Fields
        offs, tensors_fields = self._build_tensor_info(offs, tensor_count)
        new_align = self.fields.get('general.alignment')
        if new_align is not None:
            if new_align.types != [GGUFValueType.UINT32]:
                raise ValueError('Bad type for general.alignment field')
            self.alignment = new_align.parts[-1][0]
        padding = offs % self.alignment
        if padding != 0:
            offs += self.alignment - padding
        self.data_offset = offs
        self._build_tensors(offs, tensors_fields)

    _DT = TypeVar('_DT', bound = npt.DTypeLike)

    # Fetch a key/value metadata field by key.
    def get_field(self, key: str) -> Union[ReaderField, None]:
        return self.fields.get(key, None)

    # Fetch a tensor from the list by index.
    def get_tensor(self, idx: int) -> ReaderTensor:
        return self.tensors[idx]

    def _get(
        self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I', 'S', '<'] = None,
    ) -> npt.NDArray[Any]:
        count = int(count)
        itemsize = int(np.empty([], dtype = dtype).itemsize)
        end_offs = offset + itemsize * count
        return (
            self.data[offset:end_offs]
            .view(dtype = dtype)[:count]
            .newbyteorder(override_order or self.byte_order)
        )

    def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
        if field.name in self.fields:
            # TODO: add option to generate error on duplicate keys
            # raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}')

            logger.warning(f'Duplicate key {field.name} at offset {field.offset}')
            self.fields[field.name + '_{}'.format(field.offset)] = field
        else:
            self.fields[field.name] = field
        return 0 if skip_sum else sum(int(part.nbytes) for part in field.parts)

    def _get_str(self, offset: int) -> tuple[npt.NDArray[np.uint64], npt.NDArray[np.uint8]]:
        slen = self._get(offset, np.uint64)
        return slen, self._get(offset + 8, np.uint8, slen[0])

    def _get_field_parts(
        self, orig_offs: int, raw_type: int,
    ) -> tuple[int, list[npt.NDArray[Any]], list[int], list[GGUFValueType]]:
        offs = orig_offs
        types: list[GGUFValueType] = []
        gtype = GGUFValueType(raw_type)
        types.append(gtype)
        # Handle strings.
        if gtype == GGUFValueType.STRING:
            sparts: list[npt.NDArray[Any]] = list(self._get_str(offs))
            size = sum(int(part.nbytes) for part in sparts)
            return size, sparts, [1], types
        # Check if it's a simple scalar type.
        nptype = self.gguf_scalar_to_np.get(gtype)
        if nptype is not None:
            val = self._get(offs, nptype)
            return int(val.nbytes), [val], [0], types
        # Handle arrays.
        if gtype == GGUFValueType.ARRAY:
            raw_itype = self._get(offs, np.uint32)
            offs += int(raw_itype.nbytes)
            alen = self._get(offs, np.uint64)
            offs += int(alen.nbytes)
            aparts: list[npt.NDArray[Any]] = [raw_itype, alen]
            data_idxs: list[int] = []
            for idx in range(alen[0]):
                curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts(offs, raw_itype[0])
                if idx == 0:
                    types += curr_types
                idxs_offs = len(aparts)
                aparts += curr_parts
                data_idxs += (idx + idxs_offs for idx in curr_idxs)
                offs += curr_size
            return offs - orig_offs, aparts, data_idxs, types
        # We can't deal with this one.
        raise ValueError('Unknown/unhandled field type {gtype}')

    def _get_tensor_info_field(self, orig_offs: int) -> ReaderField:
        offs = orig_offs

        # Get Tensor Name
        name_len, name_data = self._get_str(offs)
        offs += int(name_len.nbytes + name_data.nbytes)

        # Get Tensor Dimensions Count
        n_dims = self._get(offs, np.uint32)
        offs += int(n_dims.nbytes)

        # Get Tensor Dimension Array
        dims = self._get(offs, np.uint64, n_dims[0])
        offs += int(dims.nbytes)

        # Get Tensor Encoding Scheme Type
        raw_dtype = self._get(offs, np.uint32)
        offs += int(raw_dtype.nbytes)

        # Get Tensor Offset
        offset_tensor = self._get(offs, np.uint64)
        offs += int(offset_tensor.nbytes)

        return ReaderField(
            orig_offs,
            str(bytes(name_data), encoding = 'utf-8'),
            [name_len, name_data, n_dims, dims, raw_dtype, offset_tensor],
            [1, 3, 4, 5],
        )

    def _build_fields(self, offs: int, count: int) -> int:
        for _ in range(count):
            orig_offs = offs
            kv_klen, kv_kdata = self._get_str(offs)
            offs += int(kv_klen.nbytes + kv_kdata.nbytes)
            raw_kv_type = self._get(offs, np.uint32)
            offs += int(raw_kv_type.nbytes)
            parts: list[npt.NDArray[Any]] = [kv_klen, kv_kdata, raw_kv_type]
            idxs_offs = len(parts)
            field_size, field_parts, field_idxs, field_types = self._get_field_parts(offs, raw_kv_type[0])
            parts += field_parts
            self._push_field(ReaderField(
                orig_offs,
                str(bytes(kv_kdata), encoding = 'utf-8'),
                parts,
                [idx + idxs_offs for idx in field_idxs],
                field_types,
            ), skip_sum = True)
            offs += field_size
        return offs

    def _build_tensor_info(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
        tensor_fields = []
        for _ in range(count):
            field = self._get_tensor_info_field(offs)
            offs += sum(int(part.nbytes) for part in field.parts)
            tensor_fields.append(field)
        return offs, tensor_fields

    def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None:
        tensors = []
        tensor_names = set()  # keep track of name to prevent duplicated tensors
        for field in fields:
            _name_len, name_data, _n_dims, dims, raw_dtype, offset_tensor = field.parts
            # check if there's any tensor having same name already in the list
            tensor_name = str(bytes(name_data), encoding = 'utf-8')
            if tensor_name in tensor_names:
                raise ValueError(f'Found duplicated tensor with name {tensor_name}')
            tensor_names.add(tensor_name)
            ggml_type = GGMLQuantizationType(raw_dtype[0])
            n_elems = int(np.prod(dims))
            np_dims = tuple(reversed(dims.tolist()))
            block_size, type_size = GGML_QUANT_SIZES[ggml_type]
            n_bytes = n_elems * type_size // block_size
            data_offs = int(start_offs + offset_tensor[0])
            item_type: npt.DTypeLike
            if ggml_type == GGMLQuantizationType.F16:
                item_count = n_elems
                item_type = np.float16
            elif ggml_type == GGMLQuantizationType.F32:
                item_count = n_elems
                item_type = np.float32
            elif ggml_type == GGMLQuantizationType.F64:
                item_count = n_elems
                item_type = np.float64
            elif ggml_type == GGMLQuantizationType.I8:
                item_count = n_elems
                item_type = np.int8
            elif ggml_type == GGMLQuantizationType.I16:
                item_count = n_elems
                item_type = np.int16
            elif ggml_type == GGMLQuantizationType.I32:
                item_count = n_elems
                item_type = np.int32
            elif ggml_type == GGMLQuantizationType.I64:
                item_count = n_elems
                item_type = np.int64
            else:
                item_count = n_bytes
                item_type = np.uint8
                np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
            tensors.append(ReaderTensor(
                name = tensor_name,
                tensor_type = ggml_type,
                shape = dims,
                n_elements = n_elems,
                n_bytes = n_bytes,
                data_offset = data_offs,
                data = self._get(data_offs, item_type, item_count).reshape(np_dims),
                field = field,
            ))
        self.tensors = tensors
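A minimal sketch of reading a file with the bundled GGUFReader, using only the attributes defined above; the file path is a placeholder.

    # Usage sketch for the vendored GGUFReader; the path is a hypothetical example.
    from gguf.gguf_reader import GGUFReader

    reader = GGUFReader("/models/adapter.gguf")   # memory-maps the file (read-only by default)
    for key, field in reader.fields.items():
        print(key, field.types)                   # metadata key and its GGUF value type(s)
    for tensor in reader.tensors:
        print(tensor.name, tensor.tensor_type, tensor.shape)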
@ -0,0 +1,882 @@
|
||||||
|
from __future__ import annotations

import logging
import os
import shutil
import struct
import tempfile
from dataclasses import dataclass
from enum import Enum, auto
from math import prod
from pathlib import Path
from io import BufferedWriter
from typing import IO, Any, Sequence, Mapping
from string import ascii_letters, digits

import numpy as np

from .constants import (
    GGUF_DEFAULT_ALIGNMENT,
    GGUF_MAGIC,
    GGUF_VERSION,
    GGMLQuantizationType,
    GGUFEndian,
    GGUFValueType,
    Keys,
    RopeScalingType,
    PoolingType,
    TokenType,
)

from .quants import quant_shape_from_byte_shape

logger = logging.getLogger(__name__)


SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"


@dataclass
class TensorInfo:
    shape: Sequence[int]
    dtype: GGMLQuantizationType
    nbytes: int
    tensor: np.ndarray[Any, Any] | None = None


@dataclass
class GGUFValue:
    value: Any
    type: GGUFValueType


class WriterState(Enum):
    NO_FILE = auto()
    EMPTY = auto()
    HEADER = auto()
    KV_DATA = auto()
    TI_DATA = auto()
    WEIGHTS = auto()


class GGUFWriter:
    fout: list[BufferedWriter] | None
    path: Path | None
    temp_file: tempfile.SpooledTemporaryFile[bytes] | None
    tensors: list[dict[str, TensorInfo]]
    kv_data: list[dict[str, GGUFValue]]
    state: WriterState
    _simple_value_packing = {
        GGUFValueType.UINT8: "B",
        GGUFValueType.INT8: "b",
        GGUFValueType.UINT16: "H",
        GGUFValueType.INT16: "h",
        GGUFValueType.UINT32: "I",
        GGUFValueType.INT32: "i",
        GGUFValueType.FLOAT32: "f",
        GGUFValueType.UINT64: "Q",
        GGUFValueType.INT64: "q",
        GGUFValueType.FLOAT64: "d",
        GGUFValueType.BOOL: "?",
    }

    def __init__(
        self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False, endianess: GGUFEndian = GGUFEndian.LITTLE,
        split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False
    ):
        self.fout = None
        self.path = Path(path) if path else None
        self.arch = arch
        self.endianess = endianess
        self.data_alignment = GGUF_DEFAULT_ALIGNMENT
        self.use_temp_file = use_temp_file
        self.temp_file = None
        self.tensors = [{}]
        self.kv_data = [{}]
        self.split_max_tensors = split_max_tensors
        self.split_max_size = split_max_size
        self.dry_run = dry_run
        self.small_first_shard = small_first_shard
        logger.info("gguf: This GGUF file is for {0} Endian only".format(
            "Big" if self.endianess == GGUFEndian.BIG else "Little",
        ))
        self.state = WriterState.NO_FILE

        if self.small_first_shard:
            self.tensors.append({})

        self.add_architecture()

    def get_total_parameter_count(self) -> tuple[int, int, int, int]:
        total_params = 0
        shared_params = 0
        expert_params = 0

        expert_sum = 0
        n_expert_tensors = 0

        last_lora_a: tuple[str, TensorInfo] | None = None

        for tensors in self.tensors:
            for name, info in tensors.items():

                shape = info.shape

                if name.endswith(".lora_a"):
                    last_lora_a = (name, info)
                    continue
                elif name.endswith(".lora_b"):
                    if last_lora_a is None or last_lora_a[0] != name[:-1] + "a":
                        # Bail when the LoRA pair can't be found trivially
                        logger.warning("can't measure LoRA size correctly, tensor order is unusual")
                        return 0, 0, 0, 0
                    else:
                        shape = (*shape[:-1], last_lora_a[1].shape[-1])

                size = prod(shape)

                if "_exps." in name:
                    expert_params += (size // shape[-3])
                    expert_sum += shape[-3]
                    n_expert_tensors += 1
                else:
                    shared_params += size

                total_params += size

        # Hopefully this should work even for variable-expert-count models
        expert_count = (expert_sum // n_expert_tensors) if n_expert_tensors > 0 else 0

        # Negate the total to signal it's likely not exact
        if last_lora_a is not None:
            total_params = -total_params

        # NOTE: keep the output in the same order as accepted by 'size_label' in gguf-py/gguf/utility.py
        return total_params, shared_params, expert_params, expert_count

    def format_shard_names(self, path: Path) -> list[Path]:
        if len(self.tensors) == 1:
            return [path]
        return [path.with_name(SHARD_NAME_FORMAT.format(path.stem, i + 1, len(self.tensors))) for i in range(len(self.tensors))]

    def open_output_file(self, path: Path | None = None) -> None:
        if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path):
            # allow calling this multiple times as long as the path is the same
            return

        if self.state is not WriterState.NO_FILE:
            raise ValueError(f'Expected output file to be not yet opened, got {self.state}')

        if path is not None:
            self.path = path

        if self.path is not None:
            filenames = self.print_plan()
            self.fout = [open(filename, "wb") for filename in filenames]
            self.state = WriterState.EMPTY

    def print_plan(self) -> list[Path]:
        logger.info("Writing the following files:")
        assert self.path is not None
        filenames = self.format_shard_names(self.path)
        assert len(filenames) == len(self.tensors)
        for name, tensors in zip(filenames, self.tensors):
            logger.info(f"{name}: n_tensors = {len(tensors)}, total_size = {GGUFWriter.format_n_bytes_to_str(sum(ti.nbytes for ti in tensors.values()))}")

        if self.dry_run:
            logger.info("Dry run, not writing files")
            for name in filenames:
                print(name)  # noqa: NP100
            exit()

        return filenames

    def add_shard_kv_data(self) -> None:
        if len(self.tensors) == 1:
            return

        total_tensors = sum(len(t) for t in self.tensors)
        assert self.fout is not None
        total_splits = len(self.fout)
        self.kv_data.extend({} for _ in range(len(self.kv_data), total_splits))
        for i, kv_data in enumerate(self.kv_data):
            kv_data[Keys.Split.LLM_KV_SPLIT_NO] = GGUFValue(i, GGUFValueType.UINT16)
            kv_data[Keys.Split.LLM_KV_SPLIT_COUNT] = GGUFValue(total_splits, GGUFValueType.UINT16)
            kv_data[Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT] = GGUFValue(total_tensors, GGUFValueType.INT32)

    def write_header_to_file(self, path: Path | None = None) -> None:
        if len(self.tensors) == 1 and (self.split_max_tensors != 0 or self.split_max_size != 0):
            logger.warning("Model fails split requirements, not splitting")

        self.open_output_file(path)

        if self.state is not WriterState.EMPTY:
            raise ValueError(f'Expected output file to be empty, got {self.state}')

        assert self.fout is not None
        assert len(self.fout) == len(self.tensors)
        assert len(self.kv_data) == 1

        self.add_shard_kv_data()

        for fout, tensors, kv_data in zip(self.fout, self.tensors, self.kv_data):
            fout.write(self._pack("<I", GGUF_MAGIC, skip_pack_prefix = True))
            fout.write(self._pack("I", GGUF_VERSION))
            fout.write(self._pack("Q", len(tensors)))
            fout.write(self._pack("Q", len(kv_data)))
            fout.flush()
        self.state = WriterState.HEADER

    def write_kv_data_to_file(self) -> None:
        if self.state is not WriterState.HEADER:
            raise ValueError(f'Expected output file to contain the header, got {self.state}')
        assert self.fout is not None

        for fout, kv_data in zip(self.fout, self.kv_data):
            kv_bytes = bytearray()

            for key, val in kv_data.items():
                kv_bytes += self._pack_val(key, GGUFValueType.STRING, add_vtype=False)
                kv_bytes += self._pack_val(val.value, val.type, add_vtype=True)

            fout.write(kv_bytes)

        self.flush()
        self.state = WriterState.KV_DATA

    def write_ti_data_to_file(self) -> None:
        if self.state is not WriterState.KV_DATA:
            raise ValueError(f'Expected output file to contain KV data, got {self.state}')
        assert self.fout is not None

        for fout, tensors in zip(self.fout, self.tensors):
            ti_data = bytearray()
            offset_tensor = 0

            for name, ti in tensors.items():
                ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False)
                n_dims = len(ti.shape)
                ti_data += self._pack("I", n_dims)
                for j in range(n_dims):
                    ti_data += self._pack("Q", ti.shape[n_dims - 1 - j])
                ti_data += self._pack("I", ti.dtype)
                ti_data += self._pack("Q", offset_tensor)
                offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment)

            fout.write(ti_data)
            fout.flush()
        self.state = WriterState.TI_DATA

    def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None:
        if any(key in kv_data for kv_data in self.kv_data):
            raise ValueError(f'Duplicated key name {key!r}')

        self.kv_data[0][key] = GGUFValue(value=val, type=vtype)

    def add_uint8(self, key: str, val: int) -> None:
        self.add_key_value(key, val, GGUFValueType.UINT8)

    def add_int8(self, key: str, val: int) -> None:
        self.add_key_value(key, val, GGUFValueType.INT8)

    def add_uint16(self, key: str, val: int) -> None:
        self.add_key_value(key, val, GGUFValueType.UINT16)

    def add_int16(self, key: str, val: int) -> None:
        self.add_key_value(key, val, GGUFValueType.INT16)

    def add_uint32(self, key: str, val: int) -> None:
        self.add_key_value(key, val, GGUFValueType.UINT32)

    def add_int32(self, key: str, val: int) -> None:
        self.add_key_value(key, val, GGUFValueType.INT32)

    def add_float32(self, key: str, val: float) -> None:
        self.add_key_value(key, val, GGUFValueType.FLOAT32)

    def add_uint64(self, key: str, val: int) -> None:
        self.add_key_value(key, val, GGUFValueType.UINT64)

    def add_int64(self, key: str, val: int) -> None:
        self.add_key_value(key, val, GGUFValueType.INT64)

    def add_float64(self, key: str, val: float) -> None:
        self.add_key_value(key, val, GGUFValueType.FLOAT64)

    def add_bool(self, key: str, val: bool) -> None:
        self.add_key_value(key, val, GGUFValueType.BOOL)

    def add_string(self, key: str, val: str) -> None:
        if not val:
            return
        self.add_key_value(key, val, GGUFValueType.STRING)

    def add_array(self, key: str, val: Sequence[Any]) -> None:
        if len(val) == 0:
            return
        self.add_key_value(key, val, GGUFValueType.ARRAY)

    @staticmethod
    def ggml_pad(x: int, n: int) -> int:
        return ((x + n - 1) // n) * n
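To make the alignment helper concrete, a small illustrative check (not part of the commit): ggml_pad rounds a byte count up to the next multiple of the alignment, which is how tensor data offsets get padded.

# Illustrative check of ggml_pad behaviour.
assert GGUFWriter.ggml_pad(10, 32) == 32   # 10 bytes padded up to one 32-byte block
assert GGUFWriter.ggml_pad(64, 32) == 64   # already aligned, unchanged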

    def add_tensor_info(
        self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype,
        tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None,
    ) -> None:
        if self.state is not WriterState.NO_FILE:
            raise ValueError(f'Expected output file to be not yet opened, got {self.state}')

        if any(name in tensors for tensors in self.tensors):
            raise ValueError(f'Duplicated tensor name {name!r}')

        if raw_dtype is None:
            if tensor_dtype == np.float16:
                dtype = GGMLQuantizationType.F16
            elif tensor_dtype == np.float32:
                dtype = GGMLQuantizationType.F32
            elif tensor_dtype == np.float64:
                dtype = GGMLQuantizationType.F64
            elif tensor_dtype == np.int8:
                dtype = GGMLQuantizationType.I8
            elif tensor_dtype == np.int16:
                dtype = GGMLQuantizationType.I16
            elif tensor_dtype == np.int32:
                dtype = GGMLQuantizationType.I32
            elif tensor_dtype == np.int64:
                dtype = GGMLQuantizationType.I64
            else:
                raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now")
        else:
            dtype = raw_dtype
            if tensor_dtype == np.uint8:
                tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)

        # make sure there is at least one tensor before splitting
        if len(self.tensors[-1]) > 0:
            if (  # split when over tensor limit
                self.split_max_tensors != 0
                and len(self.tensors[-1]) >= self.split_max_tensors
            ) or (  # split when over size limit
                self.split_max_size != 0
                and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_max_size
            ):
                self.tensors.append({})

        self.tensors[-1][name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes)

    def add_tensor(
        self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None,
        raw_dtype: GGMLQuantizationType | None = None,
    ) -> None:
        if self.endianess == GGUFEndian.BIG:
            tensor.byteswap(inplace=True)
        if self.use_temp_file and self.temp_file is None:
            fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256 * 1024 * 1024)
            fp.seek(0)
            self.temp_file = fp

        shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape
        self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype)

        if self.temp_file is None:
            self.tensors[-1][name].tensor = tensor
            return

        tensor.tofile(self.temp_file)
        self.write_padding(self.temp_file, tensor.nbytes)

    def write_padding(self, fp: IO[bytes], n: int, align: int | None = None) -> None:
        pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n
        if pad != 0:
            fp.write(bytes([0] * pad))

    def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
        if self.state is not WriterState.TI_DATA and self.state is not WriterState.WEIGHTS:
            raise ValueError(f'Expected output file to contain tensor info or weights, got {self.state}')
        assert self.fout is not None

        if self.endianess == GGUFEndian.BIG:
            tensor.byteswap(inplace=True)

        file_id = -1
        for i, tensors in enumerate(self.tensors):
            if len(tensors) > 0:
                file_id = i
                break

        fout = self.fout[file_id]

        # pop the first tensor info
        # TODO: cleaner way to get the first key
        first_tensor_name = [name for name, _ in zip(self.tensors[file_id].keys(), range(1))][0]
        ti = self.tensors[file_id].pop(first_tensor_name)
        assert ti.nbytes == tensor.nbytes

        self.write_padding(fout, fout.tell())
        tensor.tofile(fout)
        self.write_padding(fout, tensor.nbytes)

        self.state = WriterState.WEIGHTS

    def write_tensors_to_file(self, *, progress: bool = False) -> None:
        self.write_ti_data_to_file()

        assert self.fout is not None

        for fout in self.fout:
            self.write_padding(fout, fout.tell())

        if self.temp_file is None:
            shard_bar = None
            bar = None

            if progress:
                from tqdm import tqdm

                total_bytes = sum(ti.nbytes for t in self.tensors for ti in t.values())

                if len(self.fout) > 1:
                    shard_bar = tqdm(desc=f"Shard (0/{len(self.fout)})", total=None, unit="byte", unit_scale=True)
                bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)

            for i, (fout, tensors) in enumerate(zip(self.fout, self.tensors)):
                if shard_bar is not None:
                    shard_bar.set_description(f"Shard ({i + 1}/{len(self.fout)})")
                    total = sum(ti.nbytes for ti in tensors.values())
                    shard_bar.reset(total=(total if total > 0 else None))

                # relying on the fact that Python dicts preserve insertion order (since 3.7)
                for ti in tensors.values():
                    assert ti.tensor is not None  # can only iterate once over the tensors
                    assert ti.tensor.nbytes == ti.nbytes
                    ti.tensor.tofile(fout)
                    if shard_bar is not None:
                        shard_bar.update(ti.nbytes)
                    if bar is not None:
                        bar.update(ti.nbytes)
                    self.write_padding(fout, ti.nbytes)
                    ti.tensor = None
        else:
            self.temp_file.seek(0)

            shutil.copyfileobj(self.temp_file, self.fout[0 if not self.small_first_shard else 1])
            self.flush()
            self.temp_file.close()

        self.state = WriterState.WEIGHTS

    def flush(self) -> None:
        assert self.fout is not None
        for fout in self.fout:
            fout.flush()

    def close(self) -> None:
        if self.fout is not None:
            for fout in self.fout:
                fout.close()
            self.fout = None

    def add_type(self, type_name: str) -> None:
        self.add_string(Keys.General.TYPE, type_name)

    def add_architecture(self) -> None:
        self.add_string(Keys.General.ARCHITECTURE, self.arch)

    def add_quantization_version(self, quantization_version: int) -> None:
        self.add_uint32(Keys.General.QUANTIZATION_VERSION, quantization_version)

    def add_custom_alignment(self, alignment: int) -> None:
        self.data_alignment = alignment
        self.add_uint32(Keys.General.ALIGNMENT, alignment)

    def add_file_type(self, ftype: int) -> None:
        self.add_uint32(Keys.General.FILE_TYPE, ftype)

    def add_name(self, name: str) -> None:
        self.add_string(Keys.General.NAME, name)

    def add_author(self, author: str) -> None:
        self.add_string(Keys.General.AUTHOR, author)

    def add_version(self, version: str) -> None:
        self.add_string(Keys.General.VERSION, version)

    def add_organization(self, organization: str) -> None:
        self.add_string(Keys.General.ORGANIZATION, organization)

    def add_finetune(self, finetune: str) -> None:
        self.add_string(Keys.General.FINETUNE, finetune)

    def add_basename(self, basename: str) -> None:
        self.add_string(Keys.General.BASENAME, basename)

    def add_description(self, description: str) -> None:
        self.add_string(Keys.General.DESCRIPTION, description)

    def add_quantized_by(self, quantized: str) -> None:
        self.add_string(Keys.General.QUANTIZED_BY, quantized)

    def add_size_label(self, size_label: str) -> None:
        self.add_string(Keys.General.SIZE_LABEL, size_label)

    def add_license(self, license: str) -> None:
        self.add_string(Keys.General.LICENSE, license)

    def add_license_name(self, license: str) -> None:
        self.add_string(Keys.General.LICENSE_NAME, license)

    def add_license_link(self, license: str) -> None:
        self.add_string(Keys.General.LICENSE_LINK, license)

    def add_url(self, url: str) -> None:
        self.add_string(Keys.General.URL, url)

    def add_doi(self, doi: str) -> None:
        self.add_string(Keys.General.DOI, doi)

    def add_uuid(self, uuid: str) -> None:
        self.add_string(Keys.General.UUID, uuid)

    def add_repo_url(self, repo_url: str) -> None:
        self.add_string(Keys.General.REPO_URL, repo_url)

    def add_source_url(self, url: str) -> None:
        self.add_string(Keys.General.SOURCE_URL, url)

    def add_source_doi(self, doi: str) -> None:
        self.add_string(Keys.General.SOURCE_DOI, doi)

    def add_source_uuid(self, uuid: str) -> None:
        self.add_string(Keys.General.SOURCE_UUID, uuid)

    def add_source_repo_url(self, repo_url: str) -> None:
        self.add_string(Keys.General.SOURCE_REPO_URL, repo_url)

    def add_base_model_count(self, source_count: int) -> None:
        self.add_uint32(Keys.General.BASE_MODEL_COUNT, source_count)

    def add_base_model_name(self, source_id: int, name: str) -> None:
        self.add_string(Keys.General.BASE_MODEL_NAME.format(id=source_id), name)

    def add_base_model_author(self, source_id: int, author: str) -> None:
        self.add_string(Keys.General.BASE_MODEL_AUTHOR.format(id=source_id), author)

    def add_base_model_version(self, source_id: int, version: str) -> None:
        self.add_string(Keys.General.BASE_MODEL_VERSION.format(id=source_id), version)

    def add_base_model_organization(self, source_id: int, organization: str) -> None:
        self.add_string(Keys.General.BASE_MODEL_ORGANIZATION.format(id=source_id), organization)

    def add_base_model_url(self, source_id: int, url: str) -> None:
        self.add_string(Keys.General.BASE_MODEL_URL.format(id=source_id), url)

    def add_base_model_doi(self, source_id: int, doi: str) -> None:
        self.add_string(Keys.General.BASE_MODEL_DOI.format(id=source_id), doi)

    def add_base_model_uuid(self, source_id: int, uuid: str) -> None:
        self.add_string(Keys.General.BASE_MODEL_UUID.format(id=source_id), uuid)

    def add_base_model_repo_url(self, source_id: int, repo_url: str) -> None:
        self.add_string(Keys.General.BASE_MODEL_REPO_URL.format(id=source_id), repo_url)

    def add_tags(self, tags: Sequence[str]) -> None:
        self.add_array(Keys.General.TAGS, tags)

    def add_languages(self, languages: Sequence[str]) -> None:
        self.add_array(Keys.General.LANGUAGES, languages)

    def add_datasets(self, datasets: Sequence[str]) -> None:
        self.add_array(Keys.General.DATASETS, datasets)

    def add_tensor_data_layout(self, layout: str) -> None:
        self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)

    def add_vocab_size(self, size: int) -> None:
        self.add_uint32(Keys.LLM.VOCAB_SIZE.format(arch=self.arch), size)

    def add_context_length(self, length: int) -> None:
        self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length)

    def add_embedding_length(self, length: int) -> None:
        self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length)

    def add_block_count(self, length: int) -> None:
        self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)

    def add_leading_dense_block_count(self, length: int) -> None:
        self.add_uint32(Keys.LLM.LEADING_DENSE_BLOCK_COUNT.format(arch=self.arch), length)

    def add_feed_forward_length(self, length: int | Sequence[int]) -> None:
        if isinstance(length, int):
            self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length)
        else:
            self.add_array(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length)

    def add_expert_feed_forward_length(self, length: int) -> None:
        self.add_uint32(Keys.LLM.EXPERT_FEED_FORWARD_LENGTH.format(arch=self.arch), length)

    def add_expert_shared_feed_forward_length(self, length: int) -> None:
        self.add_uint32(Keys.LLM.EXPERT_SHARED_FEED_FORWARD_LENGTH.format(arch=self.arch), length)

    def add_parallel_residual(self, use: bool) -> None:
        self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

    def add_decoder_start_token_id(self, id: int) -> None:
        self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id)

    def add_head_count(self, count: int | Sequence[int]) -> None:
        if isinstance(count, int):
            self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)
        else:
            self.add_array(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)

    def add_head_count_kv(self, count: int | Sequence[int]) -> None:
        if isinstance(count, int):
            self.add_uint32(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count)
        else:
            self.add_array(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count)

    def add_key_length(self, length: int) -> None:
        self.add_uint32(Keys.Attention.KEY_LENGTH.format(arch=self.arch), length)

    def add_value_length(self, length: int) -> None:
        self.add_uint32(Keys.Attention.VALUE_LENGTH.format(arch=self.arch), length)

    def add_max_alibi_bias(self, bias: float) -> None:
        self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias)

    def add_clamp_kqv(self, value: float) -> None:
        self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)

    def add_logit_scale(self, value: float) -> None:
        self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value)

    def add_attn_logit_softcapping(self, value: float) -> None:
        self.add_float32(Keys.LLM.ATTN_LOGIT_SOFTCAPPING.format(arch=self.arch), value)

    def add_final_logit_softcapping(self, value: float) -> None:
        self.add_float32(Keys.LLM.FINAL_LOGIT_SOFTCAPPING.format(arch=self.arch), value)

    def add_expert_count(self, count: int) -> None:
        self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)

    def add_expert_used_count(self, count: int) -> None:
        self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count)

    def add_expert_shared_count(self, count: int) -> None:
        self.add_uint32(Keys.LLM.EXPERT_SHARED_COUNT.format(arch=self.arch), count)

    def add_expert_weights_scale(self, value: float) -> None:
        self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)

    def add_layer_norm_eps(self, value: float) -> None:
        self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)

    def add_layer_norm_rms_eps(self, value: float) -> None:
        self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value)

    def add_causal_attention(self, value: bool) -> None:
        self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)

    def add_q_lora_rank(self, length: int) -> None:
        self.add_uint32(Keys.Attention.Q_LORA_RANK.format(arch=self.arch), length)

    def add_kv_lora_rank(self, length: int) -> None:
        self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length)

    def add_relative_attn_buckets_count(self, value: int) -> None:
        self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value)

    def add_sliding_window(self, value: int) -> None:
        self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value)

    def add_pooling_type(self, value: PoolingType) -> None:
        self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)

    def add_rope_dimension_count(self, count: int) -> None:
        self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)

    def add_rope_freq_base(self, value: float) -> None:
        self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value)

    def add_rope_scaling_type(self, value: RopeScalingType) -> None:
        self.add_string(Keys.Rope.SCALING_TYPE.format(arch=self.arch), value.value)

    def add_rope_scaling_factor(self, value: float) -> None:
        self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value)

    def add_rope_scaling_attn_factors(self, value: float) -> None:
        self.add_float32(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch), value)

    def add_rope_scaling_orig_ctx_len(self, value: int) -> None:
        self.add_uint32(Keys.Rope.SCALING_ORIG_CTX_LEN.format(arch=self.arch), value)

    def add_rope_scaling_finetuned(self, value: bool) -> None:
        self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value)

    def add_rope_scaling_yarn_log_mul(self, value: float) -> None:
        self.add_float32(Keys.Rope.SCALING_YARN_LOG_MUL.format(arch=self.arch), value)

    def add_ssm_conv_kernel(self, value: int) -> None:
        self.add_uint32(Keys.SSM.CONV_KERNEL.format(arch=self.arch), value)

    def add_ssm_inner_size(self, value: int) -> None:
        self.add_uint32(Keys.SSM.INNER_SIZE.format(arch=self.arch), value)

    def add_ssm_state_size(self, value: int) -> None:
        self.add_uint32(Keys.SSM.STATE_SIZE.format(arch=self.arch), value)

    def add_ssm_time_step_rank(self, value: int) -> None:
        self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value)

    def add_tokenizer_model(self, model: str) -> None:
        self.add_string(Keys.Tokenizer.MODEL, model)

    def add_tokenizer_pre(self, pre: str) -> None:
        self.add_string(Keys.Tokenizer.PRE, pre)

    def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None:
        self.add_array(Keys.Tokenizer.LIST, tokens)

    def add_token_merges(self, merges: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None:
        self.add_array(Keys.Tokenizer.MERGES, merges)

    def add_token_types(self, types: Sequence[TokenType] | Sequence[int]) -> None:
        self.add_array(Keys.Tokenizer.TOKEN_TYPE, types)

    def add_token_type_count(self, value: int) -> None:
        self.add_uint32(Keys.Tokenizer.TOKEN_TYPE_COUNT, value)

    def add_token_scores(self, scores: Sequence[float]) -> None:
        self.add_array(Keys.Tokenizer.SCORES, scores)

    def add_bos_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.BOS_ID, id)

    def add_eos_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.EOS_ID, id)

    def add_unk_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.UNK_ID, id)

    def add_sep_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.SEP_ID, id)

    def add_pad_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.PAD_ID, id)

    def add_cls_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.CLS_ID, id)

    def add_mask_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.MASK_ID, id)

    def add_add_bos_token(self, value: bool) -> None:
        self.add_bool(Keys.Tokenizer.ADD_BOS, value)

    def add_add_eos_token(self, value: bool) -> None:
        self.add_bool(Keys.Tokenizer.ADD_EOS, value)

    def add_add_space_prefix(self, value: bool) -> None:
        self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)

    def add_remove_extra_whitespaces(self, value: bool) -> None:
        self.add_bool(Keys.Tokenizer.REMOVE_EXTRA_WS, value)

    def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None:
        self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap)

    def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
        if not isinstance(value, str):
            template_default = None
            template_names = set()

            for choice in value:
                name = choice.get('name', '')
                template = choice.get('template')

                # Allowing non-alphanumerical characters in template name is probably not a good idea, so filter it
                name = ''.join((c if c in ascii_letters + digits else '_' for c in name))

                if name and template is not None:
                    if name == 'default':
                        template_default = template
                    else:
                        template_names.add(name)
                        self.add_string(Keys.Tokenizer.CHAT_TEMPLATE_N.format(name=name), template)

            if template_names:
                self.add_array(Keys.Tokenizer.CHAT_TEMPLATES, list(template_names))

            if template_default is None:
                return

            value = template_default

        self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)

    def add_prefix_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.PREFIX_ID, id)

    def add_suffix_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id)

    def add_middle_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id)

    def add_eot_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.EOT_ID, id)

    def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
        pack_prefix = ''
        if not skip_pack_prefix:
            pack_prefix = '<' if self.endianess == GGUFEndian.LITTLE else '>'
        return struct.pack(f'{pack_prefix}{fmt}', value)

    def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool) -> bytes:
        kv_data = bytearray()

        if add_vtype:
            kv_data += self._pack("I", vtype)

        pack_fmt = self._simple_value_packing.get(vtype)
        if pack_fmt is not None:
            kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL)
        elif vtype == GGUFValueType.STRING:
            encoded_val = val.encode("utf-8") if isinstance(val, str) else val
            kv_data += self._pack("Q", len(encoded_val))
            kv_data += encoded_val
        elif vtype == GGUFValueType.ARRAY:

            if not isinstance(val, Sequence):
                raise ValueError("Invalid GGUF metadata array, expecting sequence")

            if len(val) == 0:
                raise ValueError("Invalid GGUF metadata array. Empty array")

            if isinstance(val, bytes):
                ltype = GGUFValueType.UINT8
            else:
                ltype = GGUFValueType.get_type(val[0])
                if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
                    raise ValueError("All items in a GGUF array should be of the same type")
            kv_data += self._pack("I", ltype)
            kv_data += self._pack("Q", len(val))
            for item in val:
                kv_data += self._pack_val(item, ltype, add_vtype=False)
        else:
            raise ValueError("Invalid GGUF metadata value type or value")

        return kv_data

    @staticmethod
    def format_n_bytes_to_str(num: int) -> str:
        if num == 0:
            return "negligible - metadata only"
        fnum = float(num)
        for unit in ("", "K", "M", "G"):
            if abs(fnum) < 1000.0:
                return f"{fnum:3.1f}{unit}"
            fnum /= 1000.0
        return f"{fnum:.1f}T - over 1TB, split recommended"
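For orientation, here is a minimal sketch of how this writer's state machine is typically driven: metadata and tensors are registered first, then the header, KV data and tensor data are written in that fixed order. The file name and dummy tensor below are placeholders for illustration, not taken from the commit.

# Illustrative usage sketch (assumed file name and dummy tensor).
import numpy as np

writer = GGUFWriter("example.gguf", arch="llama")
writer.add_block_count(32)                                        # stored in the KV data
writer.add_tensor("tok_embd.weight", np.zeros((8, 16), dtype=np.float32))

writer.write_header_to_file()      # EMPTY  -> HEADER
writer.write_kv_data_to_file()     # HEADER -> KV_DATA
writer.write_tensors_to_file()     # KV_DATA -> TI_DATA -> WEIGHTS
writer.close()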
@@ -0,0 +1,211 @@
from __future__ import annotations
from abc import ABC, ABCMeta, abstractmethod

import logging
from typing import Any, Callable

import numpy as np
from numpy.typing import DTypeLike


logger = logging.getLogger(__name__)


class LazyMeta(ABCMeta):

    def __new__(cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs):
        def __getattr__(self, name: str) -> Any:
            meta_attr = getattr(self._meta, name)
            if callable(meta_attr):
                return type(self)._wrap_fn(
                    (lambda s, *args, **kwargs: getattr(s, name)(*args, **kwargs)),
                    use_self=self,
                )
            elif isinstance(meta_attr, self._tensor_type):
                # e.g. self.T with torch.Tensor should still be wrapped
                return type(self)._wrap_fn(lambda s: getattr(s, name))(self)
            else:
                # no need to wrap non-tensor properties,
                # and they likely don't depend on the actual contents of the tensor
                return meta_attr

        namespace["__getattr__"] = __getattr__

        # need to make a builder for the wrapped wrapper to copy the name,
        # or else it fails with very cryptic error messages,
        # because somehow the same string would end up in every closures
        def mk_wrap(op_name: str, *, meta_noop: bool = False):
            # need to wrap the wrapper to get self
            def wrapped_special_op(self, *args, **kwargs):
                return type(self)._wrap_fn(
                    getattr(type(self)._tensor_type, op_name),
                    meta_noop=meta_noop,
                )(self, *args, **kwargs)
            return wrapped_special_op

        # special methods bypass __getattr__, so they need to be added manually
        # ref: https://docs.python.org/3/reference/datamodel.html#special-lookup
        # NOTE: doing this from a metaclass is very convenient
        # TODO: make this even more comprehensive
        for binary_op in (
            "lt", "le", "eq", "ne", "ge", "gt", "not",
            "abs", "add", "and", "floordiv", "invert", "lshift", "mod", "mul", "matmul",
            "neg", "or", "pos", "pow", "rshift", "sub", "truediv", "xor",
            "iadd", "iand", "ifloordiv", "ilshift", "imod", "imul", "ior", "irshift", "isub", "ixor",
            "radd", "rand", "rfloordiv", "rmul", "ror", "rpow", "rsub", "rtruediv", "rxor",
        ):
            attr_name = f"__{binary_op}__"
            # the result of these operators usually has the same shape and dtype as the input,
            # so evaluation on the meta tensor can be skipped.
            namespace[attr_name] = mk_wrap(attr_name, meta_noop=True)

        for special_op in (
            "getitem", "setitem", "len",
        ):
            attr_name = f"__{special_op}__"
            namespace[attr_name] = mk_wrap(attr_name, meta_noop=False)

        return super().__new__(cls, name, bases, namespace, **kwargs)


# Tree of lazy tensors
class LazyBase(ABC, metaclass=LazyMeta):
    _tensor_type: type
    _meta: Any
    _data: Any | None
    _args: tuple
    _kwargs: dict[str, Any]
    _func: Callable[[Any], Any] | None

    def __init__(self, *, meta: Any, data: Any | None = None, args: tuple = (), kwargs: dict[str, Any] | None = None, func: Callable[[Any], Any] | None = None):
        super().__init__()
        self._meta = meta
        self._data = data
        self._args = args
        self._kwargs = kwargs if kwargs is not None else {}
        self._func = func
        assert self._func is not None or self._data is not None

    def __init_subclass__(cls) -> None:
        if "_tensor_type" not in cls.__dict__:
            raise TypeError(f"property '_tensor_type' must be defined for {cls!r}")
        return super().__init_subclass__()

    @staticmethod
    def _recurse_apply(o: Any, fn: Callable[[Any], Any]) -> Any:
        # TODO: dict and set
        if isinstance(o, (list, tuple)):
            L = []
            for item in o:
                L.append(LazyBase._recurse_apply(item, fn))
            if isinstance(o, tuple):
                L = tuple(L)
            return L
        elif isinstance(o, LazyBase):
            return fn(o)
        else:
            return o

    @classmethod
    def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike | tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]] = False) -> Callable[[Any], Any]:
        def wrapped_fn(*args, **kwargs):
            if kwargs is None:
                kwargs = {}
            args = ((use_self,) if use_self is not None else ()) + args

            meta_args = LazyBase._recurse_apply(args, lambda t: t._meta)
            # TODO: maybe handle tensors in kwargs too

            if isinstance(meta_noop, bool) and not meta_noop:
                try:
                    res = fn(*meta_args, **kwargs)
                except NotImplementedError:
                    # running some operations on PyTorch's Meta tensors can cause this exception
                    res = None
            else:
                # some operators don't need to actually run on the meta tensors
                assert len(args) > 0
                res = args[0]
                assert isinstance(res, cls)
                res = res._meta
                # allow operations to override the dtype and shape
                if meta_noop is not True:
                    if isinstance(meta_noop, tuple):
                        dtype, shape = meta_noop
                        assert callable(shape)
                        res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape))
                    else:
                        res = cls.meta_with_dtype_and_shape(meta_noop, res.shape)

            if isinstance(res, cls._tensor_type):
                return cls(meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn)
            else:
                del res  # not needed
                # non-tensor return likely relies on the contents of the args
                # (e.g. the result of torch.equal)
                eager_args = cls.to_eager(args)
                return fn(*eager_args, **kwargs)
        return wrapped_fn

    @classmethod
    def to_eager(cls, t: Any) -> Any:
        def simple_to_eager(_t: LazyBase) -> Any:
            if _t._data is not None:
                return _t._data

            # NOTE: there's a recursion limit in Python (usually 1000)

            assert _t._func is not None
            _t._args = cls._recurse_apply(_t._args, simple_to_eager)
            _t._data = _t._func(*_t._args, **_t._kwargs)
            # sanity check
            assert _t._data is not None
            assert _t._data.dtype == _t._meta.dtype
            assert _t._data.shape == _t._meta.shape

            return _t._data

        # recurse into lists and/or tuples, keeping their structure
        return cls._recurse_apply(t, simple_to_eager)

    @classmethod
    def eager_to_meta(cls, t: Any) -> Any:
        return cls.meta_with_dtype_and_shape(t.dtype, t.shape)

    # must be overridden, meta tensor init is backend-specific
    @classmethod
    @abstractmethod
    def meta_with_dtype_and_shape(cls, dtype: Any, shape: Any) -> Any: pass

    @classmethod
    def from_eager(cls, t: Any) -> Any:
        if type(t) is cls:
            # already lazy
            return t
        elif isinstance(t, cls._tensor_type):
            return cls(meta=cls.eager_to_meta(t), data=t)
        else:
            raise TypeError(f"{type(t)!r} is not compatible with {cls._tensor_type!r}")

class LazyNumpyTensor(LazyBase):
    _tensor_type = np.ndarray

    @classmethod
    def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
        # The initial idea was to use np.nan as the fill value,
        # but non-float types like np.int16 can't use that.
        # So zero it is.
        cheat = np.zeros(1, dtype)
        return np.lib.stride_tricks.as_strided(cheat, shape, (0 for _ in shape))

    def astype(self, dtype, *args, **kwargs):
        meta = type(self).meta_with_dtype_and_shape(dtype, self._meta.shape)
        full_args = (self, dtype,) + args
        return type(self)(meta=meta, args=full_args, kwargs=kwargs, func=(lambda a, *args, **kwargs: a.astype(*args, **kwargs)))

    def tofile(self, *args, **kwargs):
        eager = LazyNumpyTensor.to_eager(self)
        return eager.tofile(*args, **kwargs)

    # TODO: __array_function__
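As an illustration of what the laziness buys (sketch only, not part of the commit): operations on a LazyNumpyTensor are recorded against a zero-strided "meta" array, and nothing is materialized until to_eager() or tofile() forces evaluation.

# Illustrative sketch: deferring an astype with LazyNumpyTensor.
import numpy as np

lazy = LazyNumpyTensor.from_eager(np.arange(6, dtype=np.float32))
converted = lazy.astype(np.float16)            # recorded, not executed yet
result = LazyNumpyTensor.to_eager(converted)   # the conversion runs here
assert result.dtype == np.float16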
@@ -0,0 +1,503 @@
from __future__ import annotations

import re
import json
import yaml
import logging
from pathlib import Path
from typing import Any, Literal, Optional
from dataclasses import dataclass

from .constants import Keys

import gguf

logger = logging.getLogger("metadata")


@dataclass
class Metadata:
    # Authorship Metadata to be written to GGUF KV Store
    name: Optional[str] = None
    author: Optional[str] = None
    version: Optional[str] = None
    organization: Optional[str] = None
    finetune: Optional[str] = None
    basename: Optional[str] = None
    description: Optional[str] = None
    quantized_by: Optional[str] = None
    size_label: Optional[str] = None
    url: Optional[str] = None
    doi: Optional[str] = None
    uuid: Optional[str] = None
    repo_url: Optional[str] = None
    source_url: Optional[str] = None
    source_doi: Optional[str] = None
    source_uuid: Optional[str] = None
    source_repo_url: Optional[str] = None
    license: Optional[str] = None
    license_name: Optional[str] = None
    license_link: Optional[str] = None
    base_models: Optional[list[dict]] = None
    tags: Optional[list[str]] = None
    languages: Optional[list[str]] = None
    datasets: Optional[list[str]] = None

    @staticmethod
    def load(metadata_override_path: Optional[Path] = None, model_path: Optional[Path] = None, model_name: Optional[str] = None, total_params: int = 0) -> Metadata:
        # This grabs as many contextual authorship metadata as possible from the model repository
        # making any conversion as required to match the gguf kv store metadata format
        # as well as giving users the ability to override any authorship metadata that may be incorrect

        # Create a new Metadata instance
        metadata = Metadata()

        model_card = Metadata.load_model_card(model_path)
        hf_params = Metadata.load_hf_parameters(model_path)
        # TODO: load adapter_config.json when possible, it usually contains the base model of the LoRA adapter

        # heuristics
        metadata = Metadata.apply_metadata_heuristic(metadata, model_card, hf_params, model_path, total_params)

        # Metadata Override File Provided
        # This is based on LLM_KV_NAMES mapping in llama.cpp
        metadata_override = Metadata.load_metadata_override(metadata_override_path)

        metadata.name = metadata_override.get(Keys.General.NAME, metadata.name)
        metadata.author = metadata_override.get(Keys.General.AUTHOR, metadata.author)
        metadata.version = metadata_override.get(Keys.General.VERSION, metadata.version)
        metadata.organization = metadata_override.get(Keys.General.ORGANIZATION, metadata.organization)

        metadata.finetune = metadata_override.get(Keys.General.FINETUNE, metadata.finetune)
        metadata.basename = metadata_override.get(Keys.General.BASENAME, metadata.basename)

        metadata.description = metadata_override.get(Keys.General.DESCRIPTION, metadata.description)
        metadata.quantized_by = metadata_override.get(Keys.General.QUANTIZED_BY, metadata.quantized_by)

        metadata.size_label = metadata_override.get(Keys.General.SIZE_LABEL, metadata.size_label)
        metadata.license_name = metadata_override.get(Keys.General.LICENSE_NAME, metadata.license_name)
        metadata.license_link = metadata_override.get(Keys.General.LICENSE_LINK, metadata.license_link)

        metadata.url = metadata_override.get(Keys.General.URL, metadata.url)
        metadata.doi = metadata_override.get(Keys.General.DOI, metadata.doi)
        metadata.uuid = metadata_override.get(Keys.General.UUID, metadata.uuid)
        metadata.repo_url = metadata_override.get(Keys.General.REPO_URL, metadata.repo_url)

        metadata.source_url = metadata_override.get(Keys.General.SOURCE_URL, metadata.source_url)
        metadata.source_doi = metadata_override.get(Keys.General.SOURCE_DOI, metadata.source_doi)
        metadata.source_uuid = metadata_override.get(Keys.General.SOURCE_UUID, metadata.source_uuid)
        metadata.source_repo_url = metadata_override.get(Keys.General.SOURCE_REPO_URL, metadata.source_repo_url)

        # Base Models is received here as an array of models
        metadata.base_models = metadata_override.get("general.base_models", metadata.base_models)

        metadata.tags = metadata_override.get(Keys.General.TAGS, metadata.tags)
        metadata.languages = metadata_override.get(Keys.General.LANGUAGES, metadata.languages)
        metadata.datasets = metadata_override.get(Keys.General.DATASETS, metadata.datasets)

        # Direct Metadata Override (via direct cli argument)
        if model_name is not None:
            metadata.name = model_name

        return metadata

    @staticmethod
    def load_metadata_override(metadata_override_path: Optional[Path] = None) -> dict[str, Any]:
        if metadata_override_path is None or not metadata_override_path.is_file():
            return {}

        with open(metadata_override_path, "r", encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def load_model_card(model_path: Optional[Path] = None) -> dict[str, Any]:
        if model_path is None or not model_path.is_dir():
            return {}

        model_card_path = model_path / "README.md"

        if not model_card_path.is_file():
            return {}

        # The model card metadata is assumed to always be in YAML
        # ref: https://github.com/huggingface/transformers/blob/a5c642fe7a1f25d3bdcd76991443ba6ff7ee34b2/src/transformers/modelcard.py#L468-L473
        with open(model_card_path, "r", encoding="utf-8") as f:
            if f.readline() == "---\n":
                raw = f.read().partition("---\n")[0]
                data = yaml.safe_load(raw)
                if isinstance(data, dict):
                    return data
                else:
                    logger.error(f"while reading YAML model card frontmatter, data is {type(data)} instead of dict")
                    return {}
            else:
                return {}

    @staticmethod
    def load_hf_parameters(model_path: Optional[Path] = None) -> dict[str, Any]:
        if model_path is None or not model_path.is_dir():
            return {}

        config_path = model_path / "config.json"

        if not config_path.is_file():
            return {}

        with open(config_path, "r", encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def id_to_title(string):
        # Convert capitalization into title form unless acronym or version number
        return ' '.join([w.title() if w.islower() and not re.match(r'^(v\d+(?:\.\d+)*|\d.*)$', w) else w for w in string.strip().replace('-', ' ').split()])
@staticmethod
|
||||||
|
def get_model_id_components(model_id: Optional[str] = None, total_params: int = 0) -> tuple[str | None, str | None, str | None, str | None, str | None, str | None]:
|
||||||
|
# Huggingface often store model id as '<org>/<model name>'
|
||||||
|
# so let's parse it and apply some heuristics if possible for model name components
|
||||||
|
|
||||||
|
if model_id is None:
|
||||||
|
# model ID missing
|
||||||
|
return None, None, None, None, None, None
|
||||||
|
|
||||||
|
if ' ' in model_id:
|
||||||
|
# model ID is actually a normal human sentence
|
||||||
|
# which means its most likely a normal model name only
|
||||||
|
# not part of the hugging face naming standard, but whatever
|
||||||
|
return model_id, None, None, None, None, None
|
||||||
|
|
||||||
|
if '/' in model_id:
|
||||||
|
# model ID (huggingface style)
|
||||||
|
org_component, model_full_name_component = model_id.split('/', 1)
|
||||||
|
else:
|
||||||
|
# model ID but missing org components
|
||||||
|
org_component, model_full_name_component = None, model_id
|
||||||
|
|
||||||
|
# Check if we erroneously matched against './' or '../' etc...
|
||||||
|
if org_component is not None and org_component[0] == '.':
|
||||||
|
org_component = None
|
||||||
|
|
||||||
|
name_parts: list[str] = model_full_name_component.split('-')
|
||||||
|
|
||||||
|
# Remove empty parts
|
||||||
|
for i in reversed(range(len(name_parts))):
|
||||||
|
if len(name_parts[i]) == 0:
|
||||||
|
del name_parts[i]
|
||||||
|
|
||||||
|
name_types: list[
|
||||||
|
set[Literal["basename", "size_label", "finetune", "version", "type"]]
|
||||||
|
] = [set() for _ in name_parts]
|
||||||
|
|
||||||
|
# Annotate the name
|
||||||
|
for i, part in enumerate(name_parts):
|
||||||
|
# Version
|
||||||
|
if re.fullmatch(r'(v|iter)?\d+([.]\d+)*', part, re.IGNORECASE):
|
||||||
|
name_types[i].add("version")
|
||||||
|
# Quant type (should not be there for base models, but still annotated)
|
||||||
|
elif re.fullmatch(r'i?q\d(_\w)*|b?fp?(16|32)', part, re.IGNORECASE):
|
||||||
|
name_types[i].add("type")
|
||||||
|
name_parts[i] = part.upper()
|
||||||
|
# Model size
|
||||||
|
elif i > 0 and re.fullmatch(r'(([A]|\d+[x])?\d+([._]\d+)?[KMBT][\d]?|small|mini|medium|large|x?xl)', part, re.IGNORECASE):
|
||||||
|
part = part.replace("_", ".")
|
||||||
|
# Handle weird bloom-7b1 notation
|
||||||
|
if part[-1].isdecimal():
|
||||||
|
part = part[:-2] + "." + part[-1] + part[-2]
|
||||||
|
# Normalize the size suffixes
|
||||||
|
if len(part) > 1 and part[-2].isdecimal():
|
||||||
|
if part[-1] in "kmbt":
|
||||||
|
part = part[:-1] + part[-1].upper()
|
||||||
|
if total_params != 0:
|
||||||
|
try:
|
||||||
|
label_params = float(part[:-1]) * pow(1000, " KMBT".find(part[-1]))
|
||||||
|
# Only use it as a size label if it's close or bigger than the model size
|
||||||
|
# Note that LoRA adapters don't necessarily include all layers,
|
||||||
|
# so this is why bigger label sizes are accepted.
|
||||||
|
# Do not use the size label when it's smaller than 1/8 of the model size
|
||||||
|
if (total_params < 0 and label_params < abs(total_params) // 8) or (
|
||||||
|
# Check both directions when the current model isn't a LoRA adapter
|
||||||
|
total_params > 0 and abs(label_params - total_params) > 7 * total_params // 8
|
||||||
|
):
|
||||||
|
# Likely a context length
|
||||||
|
name_types[i].add("finetune")
|
||||||
|
# Lowercase the size when it's a context length
|
||||||
|
part = part[:-1] + part[-1].lower()
|
||||||
|
except ValueError:
|
||||||
|
# Failed to convert the size label to float, use it anyway
|
||||||
|
pass
|
||||||
|
if len(name_types[i]) == 0:
|
||||||
|
name_types[i].add("size_label")
|
||||||
|
name_parts[i] = part
|
||||||
|
# Some easy to recognize finetune names
|
||||||
|
elif i > 0 and re.fullmatch(r'chat|instruct|vision|lora', part, re.IGNORECASE):
|
||||||
|
if total_params < 0 and part.lower() == "lora":
|
||||||
|
# ignore redundant "lora" in the finetune part when the output is a lora adapter
|
||||||
|
name_types[i].add("type")
|
||||||
|
else:
|
||||||
|
name_types[i].add("finetune")
|
||||||
|
|
||||||
|
# Ignore word-based size labels when there is at least a number-based one present
|
||||||
|
# TODO: should word-based size labels always be removed instead?
|
||||||
|
if any(c.isdecimal() for n, t in zip(name_parts, name_types) if "size_label" in t for c in n):
|
||||||
|
for n, t in zip(name_parts, name_types):
|
||||||
|
if "size_label" in t:
|
||||||
|
if all(c.isalpha() for c in n):
|
||||||
|
t.remove("size_label")
|
||||||
|
|
||||||
|
at_start = True
|
||||||
|
# Find the basename through the annotated name
|
||||||
|
for part, t in zip(name_parts, name_types):
|
||||||
|
if at_start and ((len(t) == 0 and part[0].isalpha()) or "version" in t):
|
||||||
|
t.add("basename")
|
||||||
|
else:
|
||||||
|
if at_start:
|
||||||
|
at_start = False
|
||||||
|
if len(t) == 0:
|
||||||
|
t.add("finetune")
|
||||||
|
|
||||||
|
# Remove the basename annotation from trailing version
|
||||||
|
for part, t in zip(reversed(name_parts), reversed(name_types)):
|
||||||
|
if "basename" in t and len(t) > 1:
|
||||||
|
t.remove("basename")
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
|
||||||
|
basename = "-".join(n for n, t in zip(name_parts, name_types) if "basename" in t) or None
|
||||||
|
# Deduplicate size labels using an order-preserving 'dict' ('set' does not preserve insertion order)
|
||||||
|
size_label = "-".join(dict.fromkeys(s for s, t in zip(name_parts, name_types) if "size_label" in t).keys()) or None
|
||||||
|
finetune = "-".join(f for f, t in zip(name_parts, name_types) if "finetune" in t) or None
|
||||||
|
# TODO: should the basename version always be excluded?
|
||||||
|
# NOTE: multiple finetune versions are joined together
|
||||||
|
version = "-".join(v for v, t, in zip(name_parts, name_types) if "version" in t and "basename" not in t) or None
|
||||||
|
|
||||||
|
if size_label is None and finetune is None and version is None:
|
||||||
|
# Too ambiguous, output nothing
|
||||||
|
basename = None
|
||||||
|
|
||||||
|
return model_full_name_component, org_component, basename, finetune, version, size_label
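# Illustrative example (comment only, not part of the upstream change): assuming a
# hypothetical ID "SomeOrg/SomeModel-7B-Instruct-v0.2" and total_params == 0, the
# heuristics above would return roughly:
#   ("SomeModel-7B-Instruct-v0.2", "SomeOrg", "SomeModel", "Instruct", "v0.2", "7B")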
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def apply_metadata_heuristic(metadata: Metadata, model_card: Optional[dict] = None, hf_params: Optional[dict] = None, model_path: Optional[Path] = None, total_params: int = 0) -> Metadata:
|
||||||
|
# Reference Model Card Metadata: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
|
||||||
|
|
||||||
|
# Model Card Heuristics
|
||||||
|
########################
|
||||||
|
if model_card is not None:
|
||||||
|
|
||||||
|
if "model_name" in model_card and metadata.name is None:
|
||||||
|
# Not part of the Hugging Face model card standard, but some model creators use it,
|
||||||
|
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
|
||||||
|
metadata.name = model_card.get("model_name")
|
||||||
|
|
||||||
|
if "model_creator" in model_card and metadata.author is None:
|
||||||
|
# Not part of the Hugging Face model card standard, but some model creators use it,
|
||||||
|
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
|
||||||
|
metadata.author = model_card.get("model_creator")
|
||||||
|
|
||||||
|
if "model_type" in model_card and metadata.basename is None:
|
||||||
|
# Not part of the Hugging Face model card standard, but some model creators use it,
|
||||||
|
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
|
||||||
|
metadata.basename = model_card.get("model_type")
|
||||||
|
|
||||||
|
if "base_model" in model_card:
|
||||||
|
# This represents the parent models that this is based on
|
||||||
|
# Example: stabilityai/stable-diffusion-xl-base-1.0. Can also be a list (for merges)
|
||||||
|
# Example of merges: https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0.1/blob/main/README.md
|
||||||
|
metadata_base_models = []
|
||||||
|
base_model_value = model_card.get("base_model", None)
|
||||||
|
|
||||||
|
if base_model_value is not None:
|
||||||
|
if isinstance(base_model_value, str):
|
||||||
|
metadata_base_models.append(base_model_value)
|
||||||
|
elif isinstance(base_model_value, list):
|
||||||
|
metadata_base_models.extend(base_model_value)
|
||||||
|
|
||||||
|
if metadata.base_models is None:
|
||||||
|
metadata.base_models = []
|
||||||
|
|
||||||
|
for model_id in metadata_base_models:
|
||||||
|
# NOTE: model size of base model is assumed to be similar to the size of the current model
|
||||||
|
model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params)
|
||||||
|
base_model = {}
|
||||||
|
if model_full_name_component is not None:
|
||||||
|
base_model["name"] = Metadata.id_to_title(model_full_name_component)
|
||||||
|
if org_component is not None:
|
||||||
|
base_model["organization"] = Metadata.id_to_title(org_component)
|
||||||
|
if version is not None:
|
||||||
|
base_model["version"] = version
|
||||||
|
if org_component is not None and model_full_name_component is not None:
|
||||||
|
base_model["repo_url"] = f"https://huggingface.co/{org_component}/{model_full_name_component}"
|
||||||
|
metadata.base_models.append(base_model)
|
||||||
|
|
||||||
|
if "license" in model_card and metadata.license is None:
|
||||||
|
metadata.license = model_card.get("license")
|
||||||
|
|
||||||
|
if "license_name" in model_card and metadata.license_name is None:
|
||||||
|
metadata.license_name = model_card.get("license_name")
|
||||||
|
|
||||||
|
if "license_link" in model_card and metadata.license_link is None:
|
||||||
|
metadata.license_link = model_card.get("license_link")
|
||||||
|
|
||||||
|
tags_value = model_card.get("tags", None)
|
||||||
|
if tags_value is not None:
|
||||||
|
|
||||||
|
if metadata.tags is None:
|
||||||
|
metadata.tags = []
|
||||||
|
|
||||||
|
if isinstance(tags_value, str):
|
||||||
|
metadata.tags.append(tags_value)
|
||||||
|
elif isinstance(tags_value, list):
|
||||||
|
metadata.tags.extend(tags_value)
|
||||||
|
|
||||||
|
pipeline_tags_value = model_card.get("pipeline_tag", None)
|
||||||
|
if pipeline_tags_value is not None:
|
||||||
|
|
||||||
|
if metadata.tags is None:
|
||||||
|
metadata.tags = []
|
||||||
|
|
||||||
|
if isinstance(pipeline_tags_value, str):
|
||||||
|
metadata.tags.append(pipeline_tags_value)
|
||||||
|
elif isinstance(pipeline_tags_value, list):
|
||||||
|
metadata.tags.extend(pipeline_tags_value)
|
||||||
|
|
||||||
|
language_value = model_card.get("languages", model_card.get("language", None))
|
||||||
|
if language_value is not None:
|
||||||
|
|
||||||
|
if metadata.languages is None:
|
||||||
|
metadata.languages = []
|
||||||
|
|
||||||
|
if isinstance(language_value, str):
|
||||||
|
metadata.languages.append(language_value)
|
||||||
|
elif isinstance(language_value, list):
|
||||||
|
metadata.languages.extend(language_value)
|
||||||
|
|
||||||
|
dataset_value = model_card.get("datasets", model_card.get("dataset", None))
|
||||||
|
if dataset_value is not None:
|
||||||
|
|
||||||
|
if metadata.datasets is None:
|
||||||
|
metadata.datasets = []
|
||||||
|
|
||||||
|
if isinstance(dataset_value, str):
|
||||||
|
metadata.datasets.append(dataset_value)
|
||||||
|
elif isinstance(dataset_value, list):
|
||||||
|
metadata.datasets.extend(dataset_value)
|
||||||
|
|
||||||
|
# Hugging Face Parameter Heuristics
|
||||||
|
####################################
|
||||||
|
|
||||||
|
if hf_params is not None:
|
||||||
|
|
||||||
|
hf_name_or_path = hf_params.get("_name_or_path")
|
||||||
|
if hf_name_or_path is not None and hf_name_or_path.count('/') <= 1:
|
||||||
|
# Use _name_or_path only if it's actually a model name and not some filesystem path
|
||||||
|
# e.g. 'meta-llama/Llama-2-7b-hf'
|
||||||
|
model_id = hf_name_or_path
|
||||||
|
model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params)
|
||||||
|
if metadata.name is None and model_full_name_component is not None:
|
||||||
|
metadata.name = Metadata.id_to_title(model_full_name_component)
|
||||||
|
if metadata.organization is None and org_component is not None:
|
||||||
|
metadata.organization = Metadata.id_to_title(org_component)
|
||||||
|
if metadata.basename is None and basename is not None:
|
||||||
|
metadata.basename = basename
|
||||||
|
if metadata.finetune is None and finetune is not None:
|
||||||
|
metadata.finetune = finetune
|
||||||
|
if metadata.version is None and version is not None:
|
||||||
|
metadata.version = version
|
||||||
|
if metadata.size_label is None and size_label is not None:
|
||||||
|
metadata.size_label = size_label
|
||||||
|
|
||||||
|
# Directory Folder Name Fallback Heuristics
|
||||||
|
############################################
|
||||||
|
if model_path is not None:
|
||||||
|
model_id = model_path.name
|
||||||
|
model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params)
|
||||||
|
if metadata.name is None and model_full_name_component is not None:
|
||||||
|
metadata.name = Metadata.id_to_title(model_full_name_component)
|
||||||
|
if metadata.organization is None and org_component is not None:
|
||||||
|
metadata.organization = Metadata.id_to_title(org_component)
|
||||||
|
if metadata.basename is None and basename is not None:
|
||||||
|
metadata.basename = basename
|
||||||
|
if metadata.finetune is None and finetune is not None:
|
||||||
|
metadata.finetune = finetune
|
||||||
|
if metadata.version is None and version is not None:
|
||||||
|
metadata.version = version
|
||||||
|
if metadata.size_label is None and size_label is not None:
|
||||||
|
metadata.size_label = size_label
|
||||||
|
|
||||||
|
return metadata
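# Usage sketch (comment only, hypothetical model card): given something like
#   {"model_name": "Some Model", "license": "apache-2.0", "tags": ["text-generation"]}
# the heuristics above would set metadata.name and metadata.license only while they
# are still None, and append the tags to metadata.tags, so explicit metadata wins.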
|
||||||
|
|
||||||
|
def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter):
|
||||||
|
assert self.name is not None
|
||||||
|
gguf_writer.add_name(self.name)
|
||||||
|
|
||||||
|
if self.author is not None:
|
||||||
|
gguf_writer.add_author(self.author)
|
||||||
|
if self.version is not None:
|
||||||
|
gguf_writer.add_version(self.version)
|
||||||
|
if self.organization is not None:
|
||||||
|
gguf_writer.add_organization(self.organization)
|
||||||
|
|
||||||
|
if self.finetune is not None:
|
||||||
|
gguf_writer.add_finetune(self.finetune)
|
||||||
|
if self.basename is not None:
|
||||||
|
gguf_writer.add_basename(self.basename)
|
||||||
|
|
||||||
|
if self.description is not None:
|
||||||
|
gguf_writer.add_description(self.description)
|
||||||
|
if self.quantized_by is not None:
|
||||||
|
gguf_writer.add_quantized_by(self.quantized_by)
|
||||||
|
|
||||||
|
if self.size_label is not None:
|
||||||
|
gguf_writer.add_size_label(self.size_label)
|
||||||
|
|
||||||
|
if self.license is not None:
|
||||||
|
gguf_writer.add_license(self.license)
|
||||||
|
if self.license_name is not None:
|
||||||
|
gguf_writer.add_license_name(self.license_name)
|
||||||
|
if self.license_link is not None:
|
||||||
|
gguf_writer.add_license_link(self.license_link)
|
||||||
|
|
||||||
|
if self.url is not None:
|
||||||
|
gguf_writer.add_url(self.url)
|
||||||
|
if self.doi is not None:
|
||||||
|
gguf_writer.add_doi(self.doi)
|
||||||
|
if self.uuid is not None:
|
||||||
|
gguf_writer.add_uuid(self.uuid)
|
||||||
|
if self.repo_url is not None:
|
||||||
|
gguf_writer.add_repo_url(self.repo_url)
|
||||||
|
|
||||||
|
if self.source_url is not None:
|
||||||
|
gguf_writer.add_source_url(self.source_url)
|
||||||
|
if self.source_doi is not None:
|
||||||
|
gguf_writer.add_source_doi(self.source_doi)
|
||||||
|
if self.source_uuid is not None:
|
||||||
|
gguf_writer.add_source_uuid(self.source_uuid)
|
||||||
|
if self.source_repo_url is not None:
|
||||||
|
gguf_writer.add_source_repo_url(self.source_repo_url)
|
||||||
|
|
||||||
|
if self.base_models is not None:
|
||||||
|
gguf_writer.add_base_model_count(len(self.base_models))
|
||||||
|
for key, base_model_entry in enumerate(self.base_models):
|
||||||
|
if "name" in base_model_entry:
|
||||||
|
gguf_writer.add_base_model_name(key, base_model_entry["name"])
|
||||||
|
if "author" in base_model_entry:
|
||||||
|
gguf_writer.add_base_model_author(key, base_model_entry["author"])
|
||||||
|
if "version" in base_model_entry:
|
||||||
|
gguf_writer.add_base_model_version(key, base_model_entry["version"])
|
||||||
|
if "organization" in base_model_entry:
|
||||||
|
gguf_writer.add_base_model_organization(key, base_model_entry["organization"])
|
||||||
|
if "url" in base_model_entry:
|
||||||
|
gguf_writer.add_base_model_url(key, base_model_entry["url"])
|
||||||
|
if "doi" in base_model_entry:
|
||||||
|
gguf_writer.add_base_model_doi(key, base_model_entry["doi"])
|
||||||
|
if "uuid" in base_model_entry:
|
||||||
|
gguf_writer.add_base_model_uuid(key, base_model_entry["uuid"])
|
||||||
|
if "repo_url" in base_model_entry:
|
||||||
|
gguf_writer.add_base_model_repo_url(key, base_model_entry["repo_url"])
|
||||||
|
|
||||||
|
if self.tags is not None:
|
||||||
|
gguf_writer.add_tags(self.tags)
|
||||||
|
if self.languages is not None:
|
||||||
|
gguf_writer.add_languages(self.languages)
|
||||||
|
if self.datasets is not None:
|
||||||
|
gguf_writer.add_datasets(self.datasets)
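# Usage sketch (comment only, names assumed): a converter would typically call
#   metadata.set_gguf_meta_model(gguf_writer)
# on an existing gguf.GGUFWriter so that every non-None field above is written as a
# general.* key-value pair in the output GGUF file.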
|
|
@ -0,0 +1,121 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
from typing import Callable, Sequence
|
||||||
|
|
||||||
|
from numpy.typing import DTypeLike
|
||||||
|
|
||||||
|
from .constants import GGML_QUANT_SIZES, GGMLQuantizationType
|
||||||
|
from .lazy import LazyNumpyTensor
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
|
||||||
|
block_size, type_size = GGML_QUANT_SIZES[quant_type]
|
||||||
|
if shape[-1] % block_size != 0:
|
||||||
|
raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
|
||||||
|
return (*shape[:-1], shape[-1] // block_size * type_size)
|
||||||
|
|
||||||
|
|
||||||
|
def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
|
||||||
|
block_size, type_size = GGML_QUANT_SIZES[quant_type]
|
||||||
|
if shape[-1] % type_size != 0:
|
||||||
|
raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
|
||||||
|
return (*shape[:-1], shape[-1] // type_size * block_size)
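# Example (comment only): for Q8_0, GGML_QUANT_SIZES gives (block_size=32, type_size=34),
# so a row of 4096 float32 values maps to 4096 // 32 * 34 = 4352 bytes;
# quant_shape_from_byte_shape inverts that calculation.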
|
||||||
|
|
||||||
|
|
||||||
|
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
|
||||||
|
def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
|
||||||
|
n = n.astype(np.float32, copy=False).view(np.uint32)
|
||||||
|
# force nan to quiet
|
||||||
|
n = np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n)
|
||||||
|
# round to nearest even
|
||||||
|
n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16
|
||||||
|
return n.astype(np.uint16)
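# Worked example (comment only): float32 1.0 has the bit pattern 0x3F800000; bit 16 is 0,
# so only 0x7fff is added and the shift keeps the top 16 bits, giving bf16 0x3F80.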
|
||||||
|
|
||||||
|
|
||||||
|
# This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
|
||||||
|
def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
|
||||||
|
rows = arr.reshape((-1, arr.shape[-1]))
|
||||||
|
osize = 1
|
||||||
|
for dim in oshape:
|
||||||
|
osize *= dim
|
||||||
|
out = np.empty(shape=osize, dtype=otype)
|
||||||
|
# compute over groups of 16 rows (arbitrary, but seems good for performance)
|
||||||
|
n_groups = (rows.shape[0] // 16) or 1
|
||||||
|
np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out)
|
||||||
|
return out.reshape(oshape)
|
||||||
|
|
||||||
|
|
||||||
|
def __quantize_bf16_array(n: np.ndarray) -> np.ndarray:
|
||||||
|
return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.uint16, oshape=n.shape)
|
||||||
|
|
||||||
|
|
||||||
|
__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.uint16)
|
||||||
|
|
||||||
|
|
||||||
|
def quantize_bf16(n: np.ndarray):
|
||||||
|
if type(n) is LazyNumpyTensor:
|
||||||
|
return __quantize_bf16_lazy(n)
|
||||||
|
else:
|
||||||
|
return __quantize_bf16_array(n)
|
||||||
|
|
||||||
|
|
||||||
|
__q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0]
|
||||||
|
|
||||||
|
|
||||||
|
def can_quantize_to_q8_0(n: np.ndarray) -> bool:
|
||||||
|
return n.shape[-1] % __q8_block_size == 0
|
||||||
|
|
||||||
|
|
||||||
|
# round away from zero
|
||||||
|
# ref: https://stackoverflow.com/a/59143326/22827863
|
||||||
|
def np_roundf(n: np.ndarray) -> np.ndarray:
|
||||||
|
a = abs(n)
|
||||||
|
floored = np.floor(a)
|
||||||
|
b = floored + np.floor(2 * (a - floored))
|
||||||
|
return np.sign(n) * b
|
||||||
|
|
||||||
|
|
||||||
|
def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]:
|
||||||
|
return (*s[:-1], s[-1] // __q8_block_size * __q8_type_size)
|
||||||
|
|
||||||
|
|
||||||
|
# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
|
||||||
|
def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray:
|
||||||
|
shape = n.shape
|
||||||
|
assert shape[-1] % __q8_block_size == 0
|
||||||
|
|
||||||
|
n_blocks = n.size // __q8_block_size
|
||||||
|
|
||||||
|
blocks = n.reshape((n_blocks, __q8_block_size)).astype(np.float32, copy=False)
|
||||||
|
|
||||||
|
d = abs(blocks).max(axis=1, keepdims=True) / 127
|
||||||
|
with np.errstate(divide="ignore"):
|
||||||
|
id = np.where(d == 0, 0, 1 / d)
|
||||||
|
qs = np_roundf(blocks * id)
|
||||||
|
|
||||||
|
# (n_blocks, 2)
|
||||||
|
d = d.astype(np.float16).view(np.uint8)
|
||||||
|
# (n_blocks, block_size)
|
||||||
|
qs = qs.astype(np.int8).view(np.uint8)
|
||||||
|
|
||||||
|
assert d.shape[1] + qs.shape[1] == __q8_type_size
|
||||||
|
|
||||||
|
return np.concatenate([d, qs], axis=1).reshape(__quantize_q8_0_shape_change(shape))
|
||||||
|
|
||||||
|
|
||||||
|
def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray:
|
||||||
|
return __apply_over_grouped_rows(__quantize_q8_0_rows, arr=n, otype=np.uint8, oshape=__quantize_q8_0_shape_change(n.shape))
|
||||||
|
|
||||||
|
|
||||||
|
__quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn(
|
||||||
|
__quantize_q8_0_array,
|
||||||
|
meta_noop=(np.uint8, __quantize_q8_0_shape_change),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def quantize_q8_0(data: np.ndarray):
|
||||||
|
if type(data) is LazyNumpyTensor:
|
||||||
|
return __quantize_q8_0_lazy(data)
|
||||||
|
else:
|
||||||
|
return __quantize_q8_0_array(data)
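# Usage sketch (comment only): for a float32 array of shape (n_rows, 4096),
# quantize_q8_0 returns a uint8 array of shape (n_rows, 4352): 128 blocks per row,
# each holding one float16 scale (2 bytes) followed by 32 int8 quants.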
|
|
@ -0,0 +1,649 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Sequence
|
||||||
|
|
||||||
|
from .constants import MODEL_ARCH, MODEL_TENSOR, MODEL_TENSORS, TENSOR_NAMES
|
||||||
|
|
||||||
|
|
||||||
|
class TensorNameMap:
|
||||||
|
mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
||||||
|
# Token embeddings
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD: (
|
||||||
|
"gpt_neox.embed_in", # gptneox
|
||||||
|
"transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais
|
||||||
|
"transformer.word_embeddings", # falcon
|
||||||
|
"word_embeddings", # bloom
|
||||||
|
"model.embed_tokens", # llama-hf
|
||||||
|
"tok_embeddings", # llama-pth
|
||||||
|
"embeddings.word_embeddings", # bert nomic-bert
|
||||||
|
"language_model.embedding.word_embeddings", # persimmon
|
||||||
|
"wte", # gpt2
|
||||||
|
"transformer.embd.wte", # phi2
|
||||||
|
"model.tok_embeddings", # internlm2
|
||||||
|
"model.embedding", # mamba-qbert
|
||||||
|
"backbone.embedding", # mamba
|
||||||
|
"backbone.embeddings", # mamba-hf
|
||||||
|
"transformer.in_out_embed", # Grok
|
||||||
|
"embedding.word_embeddings", # chatglm
|
||||||
|
"transformer.token_embeddings", # openelm
|
||||||
|
"shared", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
# Token type embeddings
|
||||||
|
MODEL_TENSOR.TOKEN_TYPES: (
|
||||||
|
"embeddings.token_type_embeddings", # bert nomic-bert
|
||||||
|
),
|
||||||
|
|
||||||
|
# Normalization of token embeddings
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD_NORM: (
|
||||||
|
"word_embeddings_layernorm", # bloom
|
||||||
|
"embeddings.LayerNorm", # bert
|
||||||
|
"emb_ln", # nomic-bert
|
||||||
|
"transformer.norm", # openelm
|
||||||
|
),
|
||||||
|
|
||||||
|
# Position embeddings
|
||||||
|
MODEL_TENSOR.POS_EMBD: (
|
||||||
|
"transformer.wpe", # gpt2
|
||||||
|
"embeddings.position_embeddings", # bert
|
||||||
|
"wpe", # gpt2
|
||||||
|
),
|
||||||
|
|
||||||
|
# Output
|
||||||
|
MODEL_TENSOR.OUTPUT: (
|
||||||
|
"embed_out", # gptneox
|
||||||
|
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais
|
||||||
|
"output", # llama-pth bloom internlm2
|
||||||
|
"word_embeddings_for_head", # persimmon
|
||||||
|
"lm_head.linear", # phi2
|
||||||
|
"output_layer", # chatglm
|
||||||
|
),
|
||||||
|
|
||||||
|
# Output norm
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM: (
|
||||||
|
"gpt_neox.final_layer_norm", # gptneox
|
||||||
|
"transformer.ln_f", # gpt2 gpt-j falcon jais
|
||||||
|
"model.norm", # llama-hf baichuan internlm2
|
||||||
|
"norm", # llama-pth
|
||||||
|
"transformer.norm_f", # mpt dbrx
|
||||||
|
"ln_f", # refact bloom qwen gpt2
|
||||||
|
"language_model.encoder.final_layernorm", # persimmon
|
||||||
|
"model.final_layernorm", # persimmon
|
||||||
|
"lm_head.ln", # phi2
|
||||||
|
"model.norm_f", # mamba-qbert
|
||||||
|
"backbone.norm_f", # mamba
|
||||||
|
"transformer.rms_norm", # Grok
|
||||||
|
"encoder.final_layernorm", # chatglm
|
||||||
|
"transformer.norm", # openelm
|
||||||
|
),
|
||||||
|
|
||||||
|
# Rope frequencies
|
||||||
|
MODEL_TENSOR.ROPE_FREQS: (
|
||||||
|
"rope.freqs", # llama-pth
|
||||||
|
"rotary_pos_emb.inv_freq", # chatglm
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
||||||
|
# Attention norm
|
||||||
|
MODEL_TENSOR.ATTN_NORM: (
|
||||||
|
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
|
||||||
|
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen jais
|
||||||
|
"transformer.blocks.{bid}.norm_1", # mpt
|
||||||
|
"transformer.h.{bid}.input_layernorm", # falcon7b
|
||||||
|
"h.{bid}.input_layernorm", # bloom
|
||||||
|
"transformer.h.{bid}.ln_mlp", # falcon40b
|
||||||
|
"model.layers.{bid}.input_layernorm", # llama-hf
|
||||||
|
"layers.{bid}.attention_norm", # llama-pth
|
||||||
|
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon
|
||||||
|
"model.layers.{bid}.ln1", # yi
|
||||||
|
"h.{bid}.ln_1", # gpt2
|
||||||
|
"transformer.h.{bid}.ln", # phi2
|
||||||
|
"model.layers.layers.{bid}.norm", # plamo
|
||||||
|
"model.layers.{bid}.attention_norm", # internlm2
|
||||||
|
"model.layers.{bid}.norm", # mamba-qbert
|
||||||
|
"backbone.layers.{bid}.norm", # mamba
|
||||||
|
"transformer.decoder_layer.{bid}.rms_norm", # Grok
|
||||||
|
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
|
||||||
|
"encoder.layers.{bid}.input_layernorm", # chatglm
|
||||||
|
"transformer.layers.{bid}.attn_norm", # openelm
|
||||||
|
),
|
||||||
|
|
||||||
|
# Attention norm 2
|
||||||
|
MODEL_TENSOR.ATTN_NORM_2: (
|
||||||
|
"transformer.h.{bid}.ln_attn", # falcon40b
|
||||||
|
"encoder.layer.{bid}.layer_norm_1", # jina-v2-code
|
||||||
|
),
|
||||||
|
|
||||||
|
# Attention query-key-value
|
||||||
|
MODEL_TENSOR.ATTN_QKV: (
|
||||||
|
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
|
||||||
|
"transformer.h.{bid}.attn.c_attn", # gpt2 qwen jais
|
||||||
|
"transformer.blocks.{bid}.attn.Wqkv", # mpt
|
||||||
|
"transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv", # dbrx
|
||||||
|
"transformer.h.{bid}.self_attention.query_key_value", # falcon
|
||||||
|
"h.{bid}.self_attention.query_key_value", # bloom
|
||||||
|
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
|
||||||
|
"model.layers.{bid}.self_attn.query_key_value", # persimmon
|
||||||
|
"h.{bid}.attn.c_attn", # gpt2
|
||||||
|
"transformer.h.{bid}.mixer.Wqkv", # phi2
|
||||||
|
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert
|
||||||
|
"model.layers.{bid}.self_attn.qkv_proj", # phi3
|
||||||
|
"encoder.layers.{bid}.self_attention.query_key_value", # chatglm
|
||||||
|
"transformer.layers.{bid}.attn.qkv_proj", # openelm
|
||||||
|
),
|
||||||
|
|
||||||
|
# Attention query
|
||||||
|
MODEL_TENSOR.ATTN_Q: (
|
||||||
|
"model.layers.{bid}.self_attn.q_proj", # llama-hf
|
||||||
|
"layers.{bid}.attention.wq", # llama-pth
|
||||||
|
"encoder.layer.{bid}.attention.self.query", # bert
|
||||||
|
"transformer.h.{bid}.attn.q_proj", # gpt-j
|
||||||
|
"model.layers.layers.{bid}.self_attn.q_proj", # plamo
|
||||||
|
"model.layers.{bid}.attention.wq", # internlm2
|
||||||
|
"transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
|
||||||
|
),
|
||||||
|
|
||||||
|
# Attention key
|
||||||
|
MODEL_TENSOR.ATTN_K: (
|
||||||
|
"model.layers.{bid}.self_attn.k_proj", # llama-hf
|
||||||
|
"layers.{bid}.attention.wk", # llama-pth
|
||||||
|
"encoder.layer.{bid}.attention.self.key", # bert
|
||||||
|
"transformer.h.{bid}.attn.k_proj", # gpt-j
|
||||||
|
"transformer.h.{bid}.attn.k", # refact
|
||||||
|
"model.layers.layers.{bid}.self_attn.k_proj", # plamo
|
||||||
|
"model.layers.{bid}.attention.wk", # internlm2
|
||||||
|
"transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
|
||||||
|
),
|
||||||
|
|
||||||
|
# Attention value
|
||||||
|
MODEL_TENSOR.ATTN_V: (
|
||||||
|
"model.layers.{bid}.self_attn.v_proj", # llama-hf
|
||||||
|
"layers.{bid}.attention.wv", # llama-pth
|
||||||
|
"encoder.layer.{bid}.attention.self.value", # bert
|
||||||
|
"transformer.h.{bid}.attn.v_proj", # gpt-j
|
||||||
|
"transformer.h.{bid}.attn.v", # refact
|
||||||
|
"model.layers.layers.{bid}.self_attn.v_proj", # plamo
|
||||||
|
"model.layers.{bid}.attention.wv", # internlm2
|
||||||
|
"transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok
|
||||||
|
),
|
||||||
|
|
||||||
|
# Attention output
|
||||||
|
MODEL_TENSOR.ATTN_OUT: (
|
||||||
|
"gpt_neox.layers.{bid}.attention.dense", # gptneox
|
||||||
|
"transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen jais
|
||||||
|
"transformer.blocks.{bid}.attn.out_proj", # mpt
|
||||||
|
"transformer.h.{bid}.self_attention.dense", # falcon
|
||||||
|
"h.{bid}.self_attention.dense", # bloom
|
||||||
|
"model.layers.{bid}.self_attn.o_proj", # llama-hf
|
||||||
|
"layers.{bid}.attention.wo", # llama-pth
|
||||||
|
"encoder.layer.{bid}.attention.output.dense", # bert
|
||||||
|
"transformer.h.{bid}.attn.out_proj", # gpt-j
|
||||||
|
"language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
|
||||||
|
"model.layers.{bid}.self_attn.dense", # persimmon
|
||||||
|
"h.{bid}.attn.c_proj", # gpt2
|
||||||
|
"transformer.h.{bid}.mixer.out_proj", # phi2
|
||||||
|
"model.layers.layers.{bid}.self_attn.o_proj", # plamo
|
||||||
|
"model.layers.{bid}.attention.wo", # internlm2
|
||||||
|
"encoder.layers.{bid}.attn.out_proj", # nomic-bert
|
||||||
|
"transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
|
||||||
|
"transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
|
||||||
|
"encoder.layers.{bid}.self_attention.dense", # chatglm
|
||||||
|
"transformer.layers.{bid}.attn.out_proj", # openelm
|
||||||
|
),
|
||||||
|
|
||||||
|
# Attention output norm
|
||||||
|
MODEL_TENSOR.ATTN_OUT_NORM: (
|
||||||
|
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
|
||||||
|
"encoder.layers.{bid}.norm1", # nomic-bert
|
||||||
|
"transformer.decoder_layer.{bid}.rms_norm_1", # Grok
|
||||||
|
"transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ATTN_POST_NORM: (
|
||||||
|
"model.layers.{bid}.post_attention_layernorm", # gemma2
|
||||||
|
),
|
||||||
|
|
||||||
|
# Rotary embeddings
|
||||||
|
MODEL_TENSOR.ATTN_ROT_EMBD: (
|
||||||
|
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
|
||||||
|
"layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth
|
||||||
|
"model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo
|
||||||
|
"transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell
|
||||||
|
),
|
||||||
|
|
||||||
|
# Feed-forward norm
|
||||||
|
MODEL_TENSOR.FFN_NORM: (
|
||||||
|
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
|
||||||
|
"transformer.h.{bid}.ln_2", # gpt2 refact qwen jais
|
||||||
|
"h.{bid}.post_attention_layernorm", # bloom
|
||||||
|
"transformer.blocks.{bid}.norm_2", # mpt
|
||||||
|
"model.layers.{bid}.post_attention_layernorm", # llama-hf
|
||||||
|
"layers.{bid}.ffn_norm", # llama-pth
|
||||||
|
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
|
||||||
|
"model.layers.{bid}.ln2", # yi
|
||||||
|
"h.{bid}.ln_2", # gpt2
|
||||||
|
"model.layers.{bid}.ffn_norm", # internlm2
|
||||||
|
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok
|
||||||
|
"encoder.layers.{bid}.post_attention_layernorm", # chatglm
|
||||||
|
"transformer.layers.{bid}.ffn_norm", # openelm
|
||||||
|
),
|
||||||
|
|
||||||
|
# Pre feed-forward norm
|
||||||
|
MODEL_TENSOR.FFN_PRE_NORM: (
|
||||||
|
"model.layers.{bid}.pre_feedforward_layernorm", # gemma2
|
||||||
|
),
|
||||||
|
|
||||||
|
# Post feed-forward norm
|
||||||
|
MODEL_TENSOR.FFN_POST_NORM: (
|
||||||
|
"model.layers.{bid}.post_feedforward_layernorm", # gemma2
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.FFN_GATE_INP: (
|
||||||
|
"layers.{bid}.feed_forward.gate", # mixtral
|
||||||
|
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
|
||||||
|
"model.layers.{bid}.mlp.gate", # qwen2moe
|
||||||
|
"transformer.decoder_layer.{bid}.router", # Grok
|
||||||
|
"transformer.blocks.{bid}.ffn.router.layer", # dbrx
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
|
||||||
|
"model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe
|
||||||
|
),
|
||||||
|
|
||||||
|
# Feed-forward up
|
||||||
|
MODEL_TENSOR.FFN_UP: (
|
||||||
|
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
|
||||||
|
"transformer.h.{bid}.mlp.c_fc", # gpt2 jais
|
||||||
|
"transformer.blocks.{bid}.ffn.up_proj", # mpt
|
||||||
|
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
|
||||||
|
"h.{bid}.mlp.dense_h_to_4h", # bloom
|
||||||
|
"model.layers.{bid}.mlp.up_proj", # llama-hf refact
|
||||||
|
"layers.{bid}.feed_forward.w3", # llama-pth
|
||||||
|
"encoder.layer.{bid}.intermediate.dense", # bert
|
||||||
|
"transformer.h.{bid}.mlp.fc_in", # gpt-j
|
||||||
|
"transformer.h.{bid}.mlp.linear_3", # refact
|
||||||
|
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
|
||||||
|
"model.layers.{bid}.mlp.dense_h_to_4h", # persimmon
|
||||||
|
"transformer.h.{bid}.mlp.w1", # qwen
|
||||||
|
"h.{bid}.mlp.c_fc", # gpt2
|
||||||
|
"transformer.h.{bid}.mlp.fc1", # phi2
|
||||||
|
"model.layers.{bid}.mlp.fc1", # phi2
|
||||||
|
"model.layers.{bid}.mlp.gate_up_proj", # phi3
|
||||||
|
"model.layers.layers.{bid}.mlp.up_proj", # plamo
|
||||||
|
"model.layers.{bid}.feed_forward.w3", # internlm2
|
||||||
|
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
|
||||||
|
"model.layers.{bid}.mlp.c_fc", # starcoder2
|
||||||
|
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
|
||||||
|
"model.layers.{bid}.residual_mlp.w3", # arctic
|
||||||
|
"encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.FFN_UP_EXP: (
|
||||||
|
"layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
|
||||||
|
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
|
||||||
|
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
|
||||||
|
"model.layers.{bid}.mlp.experts.up_proj", # qwen2moe (merged)
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.FFN_UP_SHEXP: (
|
||||||
|
"model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
|
||||||
|
"model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek2
|
||||||
|
),
|
||||||
|
|
||||||
|
# AWQ-activation gate
|
||||||
|
MODEL_TENSOR.FFN_ACT: (
|
||||||
|
"transformer.blocks.{bid}.ffn.act", # mpt
|
||||||
|
),
|
||||||
|
|
||||||
|
# Feed-forward gate
|
||||||
|
MODEL_TENSOR.FFN_GATE: (
|
||||||
|
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact
|
||||||
|
"layers.{bid}.feed_forward.w1", # llama-pth
|
||||||
|
"transformer.h.{bid}.mlp.w2", # qwen
|
||||||
|
"transformer.h.{bid}.mlp.c_fc2", # jais
|
||||||
|
"model.layers.layers.{bid}.mlp.gate_proj", # plamo
|
||||||
|
"model.layers.{bid}.feed_forward.w1", # internlm2
|
||||||
|
"encoder.layers.{bid}.mlp.fc12", # nomic-bert
|
||||||
|
"encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2
|
||||||
|
"transformer.h.{bid}.mlp.linear_1", # refact
|
||||||
|
"model.layers.{bid}.residual_mlp.w1", # arctic
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.FFN_GATE_EXP: (
|
||||||
|
"layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
|
||||||
|
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
|
||||||
|
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
|
||||||
|
"model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe (merged)
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.FFN_GATE_SHEXP: (
|
||||||
|
"model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
|
||||||
|
"model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek2
|
||||||
|
),
|
||||||
|
|
||||||
|
# Feed-forward down
|
||||||
|
MODEL_TENSOR.FFN_DOWN: (
|
||||||
|
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
|
||||||
|
"transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen jais
|
||||||
|
"transformer.blocks.{bid}.ffn.down_proj", # mpt
|
||||||
|
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
|
||||||
|
"h.{bid}.mlp.dense_4h_to_h", # bloom
|
||||||
|
"model.layers.{bid}.mlp.down_proj", # llama-hf
|
||||||
|
"layers.{bid}.feed_forward.w2", # llama-pth
|
||||||
|
"encoder.layer.{bid}.output.dense", # bert
|
||||||
|
"transformer.h.{bid}.mlp.fc_out", # gpt-j
|
||||||
|
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
|
||||||
|
"model.layers.{bid}.mlp.dense_4h_to_h", # persimmon
|
||||||
|
"h.{bid}.mlp.c_proj", # gpt2
|
||||||
|
"transformer.h.{bid}.mlp.fc2", # phi2
|
||||||
|
"model.layers.{bid}.mlp.fc2", # phi2
|
||||||
|
"model.layers.layers.{bid}.mlp.down_proj", # plamo
|
||||||
|
"model.layers.{bid}.feed_forward.w2", # internlm2
|
||||||
|
"encoder.layers.{bid}.mlp.fc2", # nomic-bert
|
||||||
|
"model.layers.{bid}.mlp.c_proj", # starcoder2
|
||||||
|
"encoder.layer.{bid}.mlp.wo", # jina-bert-v2
|
||||||
|
"transformer.layers.{bid}.ffn.proj_2", # openelm
|
||||||
|
"model.layers.{bid}.residual_mlp.w2", # arctic
|
||||||
|
"encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2
|
||||||
|
"encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||||
|
"layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
|
||||||
|
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
|
||||||
|
"transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
|
||||||
|
"model.layers.{bid}.mlp.experts.down_proj", # qwen2moe (merged)
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.FFN_DOWN_SHEXP: (
|
||||||
|
"model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
|
||||||
|
"model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek2
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ATTN_Q_NORM: (
|
||||||
|
"language_model.encoder.layers.{bid}.self_attention.q_layernorm",
|
||||||
|
"model.layers.{bid}.self_attn.q_layernorm", # persimmon
|
||||||
|
"model.layers.{bid}.self_attn.q_norm", # cohere
|
||||||
|
"transformer.blocks.{bid}.attn.q_ln", # sea-lion
|
||||||
|
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
|
||||||
|
"transformer.layers.{bid}.attn.q_norm", # openelm
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ATTN_K_NORM: (
|
||||||
|
"language_model.encoder.layers.{bid}.self_attention.k_layernorm",
|
||||||
|
"model.layers.{bid}.self_attn.k_layernorm", # persimmon
|
||||||
|
"model.layers.{bid}.self_attn.k_norm", # cohere
|
||||||
|
"transformer.blocks.{bid}.attn.k_ln", # sea-lion
|
||||||
|
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
|
||||||
|
"transformer.layers.{bid}.attn.k_norm", # openelm
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ROPE_FREQS: (
|
||||||
|
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.LAYER_OUT_NORM: (
|
||||||
|
"encoder.layer.{bid}.output.LayerNorm", # bert
|
||||||
|
"encoder.layers.{bid}.norm2", # nomic-bert
|
||||||
|
"transformer.decoder_layer.{bid}.rms_norm_3", # Grok
|
||||||
|
"encoder.layer.{bid}.mlp.layernorm", # jina-bert-v2
|
||||||
|
"encoder.layer.{bid}.layer_norm_2" # jina-v2-code
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.SSM_IN: (
|
||||||
|
"model.layers.{bid}.in_proj",
|
||||||
|
"backbone.layers.{bid}.mixer.in_proj",
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.SSM_CONV1D: (
|
||||||
|
"model.layers.{bid}.conv1d",
|
||||||
|
"backbone.layers.{bid}.mixer.conv1d",
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.SSM_X: (
|
||||||
|
"model.layers.{bid}.x_proj",
|
||||||
|
"backbone.layers.{bid}.mixer.x_proj",
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.SSM_DT: (
|
||||||
|
"model.layers.{bid}.dt_proj",
|
||||||
|
"backbone.layers.{bid}.mixer.dt_proj",
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.SSM_A: (
|
||||||
|
"model.layers.{bid}.A_log",
|
||||||
|
"backbone.layers.{bid}.mixer.A_log",
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.SSM_D: (
|
||||||
|
"model.layers.{bid}.D",
|
||||||
|
"backbone.layers.{bid}.mixer.D",
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.SSM_OUT: (
|
||||||
|
"model.layers.{bid}.out_proj",
|
||||||
|
"backbone.layers.{bid}.mixer.out_proj",
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ATTN_Q_A: (
|
||||||
|
"model.layers.{bid}.self_attn.q_a_proj", # deepseek2
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ATTN_Q_B: (
|
||||||
|
"model.layers.{bid}.self_attn.q_b_proj", # deepseek2
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ATTN_KV_A_MQA: (
|
||||||
|
"model.layers.{bid}.self_attn.kv_a_proj_with_mqa", # deepseek2
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ATTN_KV_B: (
|
||||||
|
"model.layers.{bid}.self_attn.kv_b_proj", # deepseek2
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ATTN_Q_A_NORM: (
|
||||||
|
"model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ATTN_KV_A_NORM: (
|
||||||
|
"model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ATTN_SUB_NORM: (
|
||||||
|
"model.layers.{bid}.self_attn.inner_attn_ln", # bitnet
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.FFN_SUB_NORM: (
|
||||||
|
"model.layers.{bid}.mlp.ffn_layernorm", # bitnet
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.DEC_ATTN_NORM: (
|
||||||
|
"decoder.block.{bid}.layer.0.layer_norm", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.DEC_ATTN_Q: (
|
||||||
|
"decoder.block.{bid}.layer.0.SelfAttention.q", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.DEC_ATTN_K: (
|
||||||
|
"decoder.block.{bid}.layer.0.SelfAttention.k", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.DEC_ATTN_V: (
|
||||||
|
"decoder.block.{bid}.layer.0.SelfAttention.v", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.DEC_ATTN_OUT: (
|
||||||
|
"decoder.block.{bid}.layer.0.SelfAttention.o", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.DEC_ATTN_REL_B: (
|
||||||
|
"decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.DEC_CROSS_ATTN_NORM: (
|
||||||
|
"decoder.block.{bid}.layer.1.layer_norm", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.DEC_CROSS_ATTN_Q: (
|
||||||
|
"decoder.block.{bid}.layer.1.EncDecAttention.q", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.DEC_CROSS_ATTN_K: (
|
||||||
|
"decoder.block.{bid}.layer.1.EncDecAttention.k", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.DEC_CROSS_ATTN_V: (
|
||||||
|
"decoder.block.{bid}.layer.1.EncDecAttention.v", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.DEC_CROSS_ATTN_OUT: (
|
||||||
|
"decoder.block.{bid}.layer.1.EncDecAttention.o", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: (
|
||||||
|
"decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.DEC_FFN_NORM: (
|
||||||
|
"decoder.block.{bid}.layer.2.layer_norm", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.DEC_FFN_GATE: (
|
||||||
|
"decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.DEC_FFN_UP: (
|
||||||
|
"decoder.block.{bid}.layer.2.DenseReluDense.wi", # t5
|
||||||
|
"decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.DEC_FFN_DOWN: (
|
||||||
|
"decoder.block.{bid}.layer.2.DenseReluDense.wo", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.DEC_OUTPUT_NORM: (
|
||||||
|
"decoder.final_layer_norm", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ENC_ATTN_NORM: (
|
||||||
|
"encoder.block.{bid}.layer.0.layer_norm", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ENC_ATTN_Q: (
|
||||||
|
"encoder.block.{bid}.layer.0.SelfAttention.q", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ENC_ATTN_K: (
|
||||||
|
"encoder.block.{bid}.layer.0.SelfAttention.k", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ENC_ATTN_V: (
|
||||||
|
"encoder.block.{bid}.layer.0.SelfAttention.v", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ENC_ATTN_OUT: (
|
||||||
|
"encoder.block.{bid}.layer.0.SelfAttention.o", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ENC_ATTN_REL_B: (
|
||||||
|
"encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ENC_FFN_NORM: (
|
||||||
|
"encoder.block.{bid}.layer.1.layer_norm", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ENC_FFN_GATE: (
|
||||||
|
"encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ENC_FFN_UP: (
|
||||||
|
"encoder.block.{bid}.layer.1.DenseReluDense.wi", # t5
|
||||||
|
"encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ENC_FFN_DOWN: (
|
||||||
|
"encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5
|
||||||
|
),
|
||||||
|
|
||||||
|
MODEL_TENSOR.ENC_OUTPUT_NORM: (
|
||||||
|
"encoder.final_layer_norm", # t5
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
# architecture-specific block mappings
|
||||||
|
arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = {
|
||||||
|
MODEL_ARCH.ARCTIC: {
|
||||||
|
MODEL_TENSOR.FFN_NORM: (
|
||||||
|
"model.layers.{bid}.residual_layernorm",
|
||||||
|
),
|
||||||
|
MODEL_TENSOR.FFN_NORM_EXP: (
|
||||||
|
"model.layers.{bid}.post_attention_layernorm",
|
||||||
|
),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
mapping: dict[str, tuple[MODEL_TENSOR, str]]
|
||||||
|
|
||||||
|
def __init__(self, arch: MODEL_ARCH, n_blocks: int):
|
||||||
|
self.mapping = {}
|
||||||
|
for tensor, keys in self.mappings_cfg.items():
|
||||||
|
if tensor not in MODEL_TENSORS[arch]:
|
||||||
|
continue
|
||||||
|
tensor_name = TENSOR_NAMES[tensor]
|
||||||
|
self.mapping[tensor_name] = (tensor, tensor_name)
|
||||||
|
for key in keys:
|
||||||
|
self.mapping[key] = (tensor, tensor_name)
|
||||||
|
if arch in self.arch_block_mappings_cfg:
|
||||||
|
self.block_mappings_cfg.update(self.arch_block_mappings_cfg[arch])
|
||||||
|
for bid in range(n_blocks):
|
||||||
|
for tensor, keys in self.block_mappings_cfg.items():
|
||||||
|
if tensor not in MODEL_TENSORS[arch]:
|
||||||
|
continue
|
||||||
|
|
||||||
|
tensor_name = TENSOR_NAMES[tensor].format(bid = bid)
|
||||||
|
self.mapping[tensor_name] = (tensor, tensor_name)
|
||||||
|
for key in keys:
|
||||||
|
key = key.format(bid = bid)
|
||||||
|
self.mapping[key] = (tensor, tensor_name)
|
||||||
|
|
||||||
|
def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
|
||||||
|
result = self.mapping.get(key)
|
||||||
|
if result is not None:
|
||||||
|
return result
|
||||||
|
for suffix in try_suffixes:
|
||||||
|
if key.endswith(suffix):
|
||||||
|
result = self.mapping.get(key[:-len(suffix)])
|
||||||
|
if result is not None:
|
||||||
|
return result[0], result[1] + suffix
|
||||||
|
return None
|
||||||
|
|
||||||
|
def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None:
|
||||||
|
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
|
||||||
|
if result is None:
|
||||||
|
return None
|
||||||
|
return result[1]
|
||||||
|
|
||||||
|
def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None:
|
||||||
|
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
|
||||||
|
if result is None:
|
||||||
|
return None
|
||||||
|
return result[0]
|
||||||
|
|
||||||
|
def __getitem__(self, key: str) -> str:
|
||||||
|
try:
|
||||||
|
return self.mapping[key][1]
|
||||||
|
except KeyError:
|
||||||
|
raise KeyError(key)
|
||||||
|
|
||||||
|
def __contains__(self, key: str) -> bool:
|
||||||
|
return key in self.mapping
|
||||||
|
|
||||||
|
def __repr__(self) -> str:
|
||||||
|
return repr(self.mapping)
|
||||||
|
|
||||||
|
|
||||||
|
def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap:
|
||||||
|
return TensorNameMap(arch, n_blocks)
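# Usage sketch (comment only, illustrative values): with something like
#   tmap = get_tensor_name_map(MODEL_ARCH.LLAMA, n_blocks)
#   tmap.get_name("model.layers.0.self_attn.q_proj.weight", try_suffixes=(".weight", ".bias"))
# the Hugging Face tensor name would resolve to the GGUF name "blk.0.attn_q.weight".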
|
|
@ -0,0 +1,69 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
|
||||||
|
def fill_templated_filename(filename: str, output_type: str | None) -> str:
|
||||||
|
# Given a file name, fill in any type templates, e.g. 'some-model-name.{ftype}.gguf'
|
||||||
|
ftype_lowercase: str = output_type.lower() if output_type is not None else ""
|
||||||
|
ftype_uppercase: str = output_type.upper() if output_type is not None else ""
|
||||||
|
return filename.format(ftype_lowercase,
|
||||||
|
outtype=ftype_lowercase, ftype=ftype_lowercase,
|
||||||
|
OUTTYPE=ftype_uppercase, FTYPE=ftype_uppercase)
|
||||||
|
|
||||||
|
|
||||||
|
def model_weight_count_rounded_notation(model_params_count: int, min_digits: int = 2) -> str:
|
||||||
|
if model_params_count > 1e12:
|
||||||
|
# Trillions Of Parameters
|
||||||
|
scaled_model_params = model_params_count * 1e-12
|
||||||
|
scale_suffix = "T"
|
||||||
|
elif model_params_count > 1e9:
|
||||||
|
# Billions Of Parameters
|
||||||
|
scaled_model_params = model_params_count * 1e-9
|
||||||
|
scale_suffix = "B"
|
||||||
|
elif model_params_count > 1e6:
|
||||||
|
# Millions Of Parameters
|
||||||
|
scaled_model_params = model_params_count * 1e-6
|
||||||
|
scale_suffix = "M"
|
||||||
|
else:
|
||||||
|
# Thousands Of Parameters
|
||||||
|
scaled_model_params = model_params_count * 1e-3
|
||||||
|
scale_suffix = "K"
|
||||||
|
|
||||||
|
fix = max(min_digits - len(str(round(scaled_model_params)).lstrip('0')), 0)
|
||||||
|
|
||||||
|
return f"{scaled_model_params:.{fix}f}{scale_suffix}"
|
||||||
|
|
||||||
|
|
||||||
|
def size_label(total_params: int, shared_params: int, expert_params: int, expert_count: int) -> str:
|
||||||
|
|
||||||
|
if expert_count > 0:
|
||||||
|
pretty_size = model_weight_count_rounded_notation(abs(shared_params) + abs(expert_params), min_digits=2)
|
||||||
|
size_class = f"{expert_count}x{pretty_size}"
|
||||||
|
else:
|
||||||
|
size_class = model_weight_count_rounded_notation(abs(total_params), min_digits=2)
|
||||||
|
|
||||||
|
return size_class
|
||||||
|
|
||||||
|
|
||||||
|
def naming_convention(model_name: str | None, base_name: str | None, finetune_string: str | None, version_string: str | None, size_label: str | None, output_type: str | None, model_type: Literal['vocab', 'LoRA'] | None = None) -> str:
|
||||||
|
# Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention
|
||||||
|
|
||||||
|
if base_name is not None:
|
||||||
|
name = base_name.strip().replace(' ', '-').replace('/', '-')
|
||||||
|
elif model_name is not None:
|
||||||
|
name = model_name.strip().replace(' ', '-').replace('/', '-')
|
||||||
|
else:
|
||||||
|
name = "ggml-model"
|
||||||
|
|
||||||
|
parameters = f"-{size_label}" if size_label is not None else ""
|
||||||
|
|
||||||
|
finetune = f"-{finetune_string.strip().replace(' ', '-')}" if finetune_string is not None else ""
|
||||||
|
|
||||||
|
version = f"-{version_string.strip().replace(' ', '-')}" if version_string is not None else ""
|
||||||
|
|
||||||
|
encoding = f"-{output_type.strip().replace(' ', '-').upper()}" if output_type is not None else ""
|
||||||
|
|
||||||
|
kind = f"-{model_type.strip().replace(' ', '-')}" if model_type is not None else ""
|
||||||
|
|
||||||
|
return f"{name}{parameters}{finetune}{version}{encoding}{kind}"
|
|
@ -0,0 +1,465 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
import logging
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Callable, Sequence, Mapping, Iterable, Protocol, ClassVar, runtime_checkable
|
||||||
|
|
||||||
|
from sentencepiece import SentencePieceProcessor
|
||||||
|
|
||||||
|
import gguf
|
||||||
|
|
||||||
|
from .gguf_writer import GGUFWriter
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class SpecialVocab:
|
||||||
|
merges: list[str]
|
||||||
|
add_special_token: dict[str, bool]
|
||||||
|
special_token_ids: dict[str, int]
|
||||||
|
chat_template: str | Sequence[Mapping[str, str]] | None
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, path: str | os.PathLike[str], load_merges: bool = False,
|
||||||
|
special_token_types: Iterable[str] | None = None,
|
||||||
|
n_vocab: int | None = None,
|
||||||
|
):
|
||||||
|
self.special_token_ids = {}
|
||||||
|
self.add_special_token = {}
|
||||||
|
self.n_vocab = n_vocab
|
||||||
|
self.load_merges = load_merges
|
||||||
|
self.merges = []
|
||||||
|
self.chat_template = None
|
||||||
|
if special_token_types is not None:
|
||||||
|
self.special_token_types = special_token_types
|
||||||
|
else:
|
||||||
|
self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad', 'cls', 'mask')
|
||||||
|
self._load(Path(path))
|
||||||
|
|
||||||
|
def __repr__(self) -> str:
|
||||||
|
return '<SpecialVocab with {} merges, special tokens {}, add special tokens {}>'.format(
|
||||||
|
len(self.merges), self.special_token_ids or "unset", self.add_special_token or "unset",
|
||||||
|
)
|
||||||
|
|
||||||
|
def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
|
||||||
|
if self.merges:
|
||||||
|
if not quiet:
|
||||||
|
logger.info(f'Adding {len(self.merges)} merge(s).')
|
||||||
|
gw.add_token_merges(self.merges)
|
||||||
|
elif self.load_merges:
|
||||||
|
logger.warning('Adding merges requested but no merges found, output may be non-functional.')
|
||||||
|
for typ, tokid in self.special_token_ids.items():
|
||||||
|
id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
|
||||||
|
if id_handler is None:
|
||||||
|
logger.warning(f'No handler for special token type {typ} with id {tokid} - skipping')
|
||||||
|
continue
|
||||||
|
if not quiet:
|
||||||
|
logger.info(f'Setting special token type {typ} to {tokid}')
|
||||||
|
id_handler(tokid)
|
||||||
|
for typ, value in self.add_special_token.items():
|
||||||
|
add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None)
|
||||||
|
if add_handler is None:
|
||||||
|
logger.warning(f'No handler for add_{typ}_token with value {value} - skipping')
|
||||||
|
continue
|
||||||
|
if not quiet:
|
||||||
|
logger.info(f'Setting add_{typ}_token to {value}')
|
||||||
|
add_handler(value)
|
||||||
|
if self.chat_template is not None:
|
||||||
|
if not quiet:
|
||||||
|
logger.info(f'Setting chat_template to {self.chat_template}')
|
||||||
|
gw.add_chat_template(self.chat_template)
|
||||||
|
|
||||||
|
def _load(self, path: Path) -> None:
|
||||||
|
self._try_load_from_tokenizer_json(path)
|
||||||
|
self._try_load_from_config_json(path)
|
||||||
|
if self.load_merges and not self.merges:
|
||||||
|
self._try_load_merges_txt(path)
|
||||||
|
|
||||||
|
def _try_load_merges_txt(self, path: Path) -> bool:
|
||||||
|
merges_file = path / 'merges.txt'
|
||||||
|
if not merges_file.is_file():
|
||||||
|
return False
|
||||||
|
with open(merges_file, 'r', encoding = 'utf-8') as fp:
|
||||||
|
first_line = next(fp, '').strip()
|
||||||
|
if not first_line.startswith('#'):
|
||||||
|
fp.seek(0)
|
||||||
|
line_num = 0
|
||||||
|
else:
|
||||||
|
line_num = 1
|
||||||
|
merges = []
|
||||||
|
for line in fp:
|
||||||
|
line_num += 1
|
||||||
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
parts = line.split(None, 3)
|
||||||
|
if len(parts) != 2:
|
||||||
|
logger.warning(f'{merges_file.name}: Line {line_num}: Entry malformed, ignoring')
|
||||||
|
continue
|
||||||
|
merges.append(f'{parts[0]} {parts[1]}')
|
||||||
|
self.merges = merges
|
||||||
|
return True
|
||||||
|
|
||||||
|
def _set_special_token(self, typ: str, tid: Any) -> None:
|
||||||
|
if not isinstance(tid, int):
|
||||||
|
return
|
||||||
|
if tid < 0:
|
||||||
|
raise ValueError(f'invalid value for special token type {typ}: {tid}')
|
||||||
|
if self.n_vocab is None or tid < self.n_vocab:
|
||||||
|
if typ in self.special_token_ids:
|
||||||
|
return
|
||||||
|
self.special_token_ids[typ] = tid
|
||||||
|
return
|
||||||
|
logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')
|
||||||
|
|
||||||
|
def _try_load_from_tokenizer_json(self, path: Path) -> bool:
|
||||||
|
tokenizer_file = path / 'tokenizer.json'
|
||||||
|
if tokenizer_file.is_file():
|
||||||
|
with open(tokenizer_file, encoding = 'utf-8') as f:
|
||||||
|
tokenizer = json.load(f)
|
||||||
|
if self.load_merges:
|
||||||
|
merges = tokenizer.get('model', {}).get('merges')
|
||||||
|
if isinstance(merges, list) and merges and isinstance(merges[0], str):
|
||||||
|
self.merges = merges
|
||||||
|
added_tokens = tokenizer.get('added_tokens', {})
|
||||||
|
else:
|
||||||
|
added_tokens = {}
|
||||||
|
tokenizer_config_file = path / 'tokenizer_config.json'
|
||||||
|
if not tokenizer_config_file.is_file():
|
||||||
|
return True
|
||||||
|
with open(tokenizer_config_file, encoding = 'utf-8') as f:
|
||||||
|
tokenizer_config = json.load(f)
|
||||||
|
chat_template = tokenizer_config.get('chat_template')
|
||||||
|
if chat_template is None or isinstance(chat_template, (str, list)):
|
||||||
|
self.chat_template = chat_template
|
||||||
|
else:
|
||||||
|
logger.warning(f'Bad type for chat_template field in {tokenizer_config_file!r} - ignoring')
|
||||||
|
for typ in self.special_token_types:
|
||||||
|
add_entry = tokenizer_config.get(f'add_{typ}_token')
|
||||||
|
if isinstance(add_entry, bool):
|
||||||
|
self.add_special_token[typ] = add_entry
|
||||||
|
entry = tokenizer_config.get(f'{typ}_token')
|
||||||
|
if isinstance(entry, str):
|
||||||
|
tc_content = entry
|
||||||
|
elif isinstance(entry, dict):
|
||||||
|
entry_content = entry.get('content')
|
||||||
|
if not isinstance(entry_content, str):
|
||||||
|
continue
|
||||||
|
tc_content = entry_content
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
# We only need the first match here.
|
||||||
|
maybe_token_id = next(
|
||||||
|
(atok.get('id') for atok in added_tokens if atok.get('content') == tc_content),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
self._set_special_token(typ, maybe_token_id)
|
||||||
|
return True
|
||||||
|
|
||||||
|
def _try_load_from_config_json(self, path: Path) -> bool:
|
||||||
|
config_file = path / 'config.json'
|
||||||
|
if not config_file.is_file():
|
||||||
|
return False
|
||||||
|
with open(config_file, encoding = 'utf-8') as f:
|
||||||
|
config = json.load(f)
|
||||||
|
for typ in self.special_token_types:
|
||||||
|
self._set_special_token(typ, config.get(f'{typ}_token_id'))
|
||||||
|
return True
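# Usage sketch (comment only, paths assumed): a converter would typically do
#   special_vocab = SpecialVocab(model_dir, load_merges=True, n_vocab=len(tokens))
#   special_vocab.add_to_gguf(gguf_writer)
# which loads tokenizer.json / tokenizer_config.json / config.json as available and
# writes merges, special token ids and the chat template into the GGUF file.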
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
class BaseVocab(Protocol):
    tokenizer_model: ClassVar[str]
    name: ClassVar[str]


@runtime_checkable
class Vocab(BaseVocab, Protocol):
    vocab_size: int
    added_tokens_dict: dict[str, int]
    added_tokens_list: list[str]
    fname_tokenizer: Path

    def __init__(self, base_path: Path): ...
    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...

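# Editor's note -- hypothetical sketch, not part of this commit: because the
# protocols above are @runtime_checkable, a concrete vocab object can be
# inspected structurally at runtime. `_describe_vocab` is an assumed helper,
# for illustration only.
def _describe_vocab(vocab: BaseVocab) -> str:
    if isinstance(vocab, Vocab):
        # Full vocabularies expose their size, added tokens, and source file.
        return f"{vocab.name}: {vocab.vocab_size} tokens from {vocab.fname_tokenizer}"
    # Otherwise only the class-level metadata from BaseVocab is available.
    return f"{vocab.name}: no embedded vocabulary"
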
class NoVocab(BaseVocab):
    tokenizer_model = "no_vocab"
    name = "no_vocab"

    def __repr__(self) -> str:
        return "<NoVocab for a model without integrated vocabulary>"

class BpeVocab(Vocab):
    tokenizer_model = "gpt2"
    name = "bpe"

    def __init__(self, base_path: Path):
        added_tokens: dict[str, int] = {}

        if (fname_tokenizer := base_path / 'vocab.json').exists():
            # "slow" tokenizer
            with open(fname_tokenizer, encoding="utf-8") as f:
                self.vocab = json.load(f)

            try:
                # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
                with open(base_path / 'added_tokens.json', encoding="utf-8") as f:
                    added_tokens = json.load(f)
            except FileNotFoundError:
                pass
        else:
            # "fast" tokenizer
            fname_tokenizer = base_path / 'tokenizer.json'

            # if this fails, FileNotFoundError propagates to caller
            with open(fname_tokenizer, encoding="utf-8") as f:
                tokenizer_json = json.load(f)

            tokenizer_model: dict[str, Any] = tokenizer_json['model']
            if (
                tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
                or tokenizer_json['decoder']['type'] != 'ByteLevel'
            ):
                raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')

            self.vocab = tokenizer_model["vocab"]

            if (added := tokenizer_json.get('added_tokens')) is not None:
                # Added tokens here can be duplicates of the main vocabulary.
                added_tokens = {item['content']: item['id']
                                for item in added
                                if item['content'] not in self.vocab}

        vocab_size = len(self.vocab)
        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
        actual_ids = sorted(added_tokens.values())
        if expected_ids != actual_ids:
            expected_end_id = vocab_size + len(actual_ids) - 1
            raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
                             f"{vocab_size} - {expected_end_id}; got {actual_ids}")

        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
        self.added_tokens_dict = added_tokens
        self.added_tokens_list = [text for (text, idx) in items]
        self.vocab_size_base = vocab_size
        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer = fname_tokenizer

    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}

        for i, _ in enumerate(self.vocab):
            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL

    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        for text in self.added_tokens_list:
            score = -1000.0
            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL

    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        yield from self.bpe_tokens()
        yield from self.added_tokens()

    def __repr__(self) -> str:
        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"

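# Editor's note -- hypothetical usage sketch, not part of this commit: loading a
# GPT-2 style BPE vocabulary with the class above. `model_dir` is an assumed
# path to a directory containing either vocab.json ("slow") or tokenizer.json
# ("fast").
def _summarize_bpe_vocab(model_dir: Path) -> None:
    vocab = BpeVocab(model_dir)
    print(vocab)  # e.g. "<BpeVocab with 32000 base tokens and 3 added tokens>"
    # added_tokens() yields the extra tokens with a fixed low score and CONTROL type.
    print(sum(1 for _ in vocab.added_tokens()), "added tokens")
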
class SentencePieceVocab(Vocab):
    tokenizer_model = "llama"
    name = "spm"

    def __init__(self, base_path: Path):
        added_tokens: dict[str, int] = {}
        if (fname_tokenizer := base_path / 'tokenizer.model').exists():
            # normal location
            try:
                with open(base_path / 'added_tokens.json', encoding="utf-8") as f:
                    added_tokens = json.load(f)
            except FileNotFoundError:
                pass
        elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
            # not found in alternate location either
            raise FileNotFoundError('Cannot find tokenizer.model')

        self.sentencepiece_tokenizer = SentencePieceProcessor()
        self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
        vocab_size = self.sentencepiece_tokenizer.vocab_size()

        new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
        actual_new_ids = sorted(new_tokens.keys())

        if expected_new_ids != actual_new_ids:
            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")

        # Token pieces that were added to the base vocabulary.
        self.added_tokens_dict = added_tokens
        self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
        self.vocab_size_base = vocab_size
        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer = fname_tokenizer

    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        tokenizer = self.sentencepiece_tokenizer
        for i in range(tokenizer.vocab_size()):
            piece = tokenizer.IdToPiece(i)
            text = piece.encode("utf-8")
            score: float = tokenizer.GetScore(i)

            toktype = gguf.TokenType.NORMAL
            if tokenizer.IsUnknown(i):
                toktype = gguf.TokenType.UNKNOWN
            if tokenizer.IsControl(i):
                toktype = gguf.TokenType.CONTROL

            # NOTE: I think added_tokens are user defined.
            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED

            if tokenizer.IsUnused(i):
                toktype = gguf.TokenType.UNUSED
            if tokenizer.IsByte(i):
                toktype = gguf.TokenType.BYTE

            yield text, score, toktype

    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        for text in self.added_tokens_list:
            score = -1000.0
            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED

    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        yield from self.sentencepiece_tokens()
        yield from self.added_tokens()

    def __repr__(self) -> str:
        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"

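# Editor's note -- hypothetical usage sketch, not part of this commit: counting
# how the SentencePiece loader above classifies pieces. `model_dir` is an
# assumed path to a directory containing tokenizer.model (or whose parent does).
def _count_spm_token_types(model_dir: Path) -> dict[str, int]:
    vocab = SentencePieceVocab(model_dir)
    counts: dict[str, int] = {}
    for _text, _score, toktype in vocab.all_tokens():
        # gguf.TokenType is an enum, so .name gives NORMAL, CONTROL, BYTE, etc.
        counts[toktype.name] = counts.get(toktype.name, 0) + 1
    return counts
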
class LlamaHfVocab(Vocab):
    tokenizer_model = "llama"
    name = "hfft"

    def __init__(self, base_path: Path):
        fname_tokenizer = base_path / 'tokenizer.json'
        # if this fails, FileNotFoundError propagates to caller
        with open(fname_tokenizer, encoding='utf-8') as f:
            tokenizer_json = json.load(f)

        # pre-check so we know if we need transformers
        tokenizer_model: dict[str, Any] = tokenizer_json['model']
        is_llama3 = (
            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
            and not tokenizer_model.get('byte_fallback', True)
        )
        if is_llama3:
            raise TypeError('Llama 3 must be converted with BpeVocab')

        if not is_llama3 and (
            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
            or tokenizer_json['decoder']['type'] != 'Sequence'
        ):
            raise FileNotFoundError('Cannot find Llama BPE tokenizer')

        try:
            from transformers import AutoTokenizer
        except ImportError as e:
            raise ImportError(
                "To use LlamaHfVocab, please install the `transformers` package. "
                "You can install it with `pip install transformers`."
            ) from e

        # Allow the tokenizer to default to slow or fast versions.
        # Explicitly set tokenizer to use local paths.
        self.tokenizer = AutoTokenizer.from_pretrained(
            base_path,
            cache_dir=base_path,
            local_files_only=True,
        )
        assert self.tokenizer.is_fast  # assume tokenizer.json is used

        # Initialize lists and dictionaries for added tokens
        self.added_tokens_list = []
        self.added_tokens_dict = dict()
        self.added_tokens_ids = set()

        # Process added tokens
        for tok, tokidx in sorted(
            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
        ):
            # Only consider added tokens that are not in the base vocabulary
            if tokidx >= self.tokenizer.vocab_size:
                self.added_tokens_list.append(tok)
                self.added_tokens_dict[tok] = tokidx
                self.added_tokens_ids.add(tokidx)

        # Store special tokens and their IDs
        self.specials = {
            tok: self.tokenizer.get_vocab()[tok]
            for tok in self.tokenizer.all_special_tokens
        }
        self.special_ids = set(self.tokenizer.all_special_ids)

        # Set vocabulary sizes
        self.vocab_size_base = self.tokenizer.vocab_size
        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)

        self.fname_tokenizer = fname_tokenizer

    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {
            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
        }

        for token_id in range(self.vocab_size_base):
            # Skip processing added tokens here
            if token_id in self.added_tokens_ids:
                continue

            # Convert token text to bytes
            token_text = reverse_vocab[token_id].encode("utf-8")

            # Yield token text, score, and type
            yield token_text, self.get_token_score(token_id), self.get_token_type(
                token_id, token_text, self.special_ids  # Reuse already stored special IDs
            )

    def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
        # Special case for byte tokens
        if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
            return gguf.TokenType.BYTE

        # Determine token type based on whether it's a special token
        return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL

    def get_token_score(self, token_id: int) -> float:
        # Placeholder for actual logic to determine the token's score
        # This needs to be implemented based on specific requirements
        return -1000.0  # Default score

    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        for text in self.added_tokens_list:
            if text in self.specials:
                toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
                score = self.get_token_score(self.specials[text])
            else:
                toktype = gguf.TokenType.USER_DEFINED
                score = -1000.0

            yield text.encode("utf-8"), score, toktype

    def has_newline_token(self):
        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab

    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        yield from self.hf_tokens()
        yield from self.added_tokens()

    def __repr__(self) -> str:
        return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"

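# Editor's note -- hypothetical usage sketch, not part of this commit: the
# HF-backed loader above needs the `transformers` package and a local
# tokenizer.json; byte tokens such as b"<0x0A>" are reported as TokenType.BYTE.
# `model_dir` is an assumed path, for illustration only.
def _load_llama_hf_vocab(model_dir: Path) -> LlamaHfVocab:
    vocab = LlamaHfVocab(model_dir)
    if not vocab.has_newline_token():
        logger.warning("tokenizer has no newline token")
    return vocab
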
@@ -169,7 +169,7 @@ def __init__(self):
        self.CONTEXT_SIZE = ""
        self.CONTEXT_SIZE_FOR_IMATRIX = ""
        self.THREADS = ""
        self.NUMBER_OF_THREADS_FOR_IMATRIX = ""

class _English(_Localization):
    def __init__(self):
@@ -335,6 +335,7 @@ def __init__(self):
        self.HOW_OFTEN_TO_SAVE_IMATRIX = "How often to save the IMatrix"
        self.SET_GPU_OFFLOAD_VALUE = "Set GPU offload value (-ngl)"
        self.COMPLETED = "Completed"
        # TODO: Add the following keys to other languages
        self.REFRESH_MODELS = "Refresh Models"
        self.EXTRA_ARGUMENTS = "Extra Arguments:"
        self.EXTRA_ARGUMENTS_LABEL = "Additional command-line arguments"
@@ -342,6 +343,59 @@ def __init__(self):
        self.CONTEXT_SIZE_FOR_IMATRIX = "Context size for IMatrix generation"
        self.THREADS = "Threads:"
        self.NUMBER_OF_THREADS_FOR_IMATRIX = "Number of threads for IMatrix generation"
        self.LORA_CONVERSION = "LoRA Conversion"
        self.LORA_INPUT_PATH = "LoRA Input Path"
        self.LORA_OUTPUT_PATH = "LoRA Output Path"
        self.SELECT_LORA_INPUT_DIRECTORY = "Select LoRA Input Directory"
        self.SELECT_LORA_OUTPUT_FILE = "Select LoRA Output File"
        self.CONVERT_LORA = "Convert LoRA"
        self.STARTING_LORA_CONVERSION = "Starting LoRA Conversion"
        self.LORA_INPUT_PATH_REQUIRED = "LoRA input path is required."
        self.LORA_OUTPUT_PATH_REQUIRED = "LoRA output path is required."
        self.ERROR_STARTING_LORA_CONVERSION = "Error starting LoRA conversion: {}"
        self.LORA_CONVERSION_TASK_STARTED = "LoRA conversion task started."
        self.BIN_FILES = "Binary Files (*.bin)"
        self.BROWSING_FOR_LORA_INPUT_DIRECTORY = "Browsing for LoRA input directory..."
        self.BROWSING_FOR_LORA_OUTPUT_FILE = "Browsing for LoRA output file..."
        self.CONVERTING_LORA = "LoRA Conversion"
        self.LORA_CONVERSION_FINISHED = "LoRA conversion finished."
        self.LORA_FILE_MOVED = "LoRA file moved from {} to {}."
        self.LORA_FILE_NOT_FOUND = "LoRA file not found: {}."
        self.ERROR_MOVING_LORA_FILE = "Error moving LoRA file: {}"
        self.EXPORT_LORA = "Export LoRA"
        self.MODEL_PATH_REQUIRED = "Model path is required."
        self.OUTPUT_PATH_REQUIRED = "Output path is required."
        self.AT_LEAST_ONE_LORA_ADAPTER_REQUIRED = "At least one LoRA adapter is required."
        self.INVALID_LORA_SCALE_VALUE = "Invalid LoRA scale value."
        self.ERROR_STARTING_LORA_EXPORT = "Error starting LoRA export: {}"
        self.LORA_EXPORT_TASK_STARTED = "LoRA export task started."
        self.GGML_LORA_ADAPTERS = "GGML LoRA Adapters"
        self.SELECT_LORA_ADAPTER_FILES = "Select LoRA Adapter Files"
        self.ADD_ADAPTER = "Add Adapter"
        self.DELETE_ADAPTER = "Delete"
        self.LORA_SCALE = "LoRA Scale"
        self.ENTER_LORA_SCALE_VALUE = "Enter LoRA Scale Value (Optional)"
        self.NUMBER_OF_THREADS_FOR_LORA_EXPORT = "Number of Threads for LoRA Export"
        self.EXPORTING_LORA = "Exporting LoRA..."
        self.BROWSING_FOR_EXPORT_LORA_MODEL_FILE = "Browsing for Export LoRA Model File..."
        self.BROWSING_FOR_EXPORT_LORA_OUTPUT_FILE = "Browsing for Export LoRA Output File..."
        self.ADDING_LORA_ADAPTER = "Adding LoRA Adapter..."
        self.DELETING_LORA_ADAPTER = "Deleting LoRA Adapter..."
        self.LORA_FILES = "LoRA Files (*.bin)"
        self.SELECT_LORA_ADAPTER_FILE = "Select LoRA Adapter File"
        self.STARTING_LORA_EXPORT = "Starting LoRA export..."
        self.OUTPUT_TYPE = "Output Type"
        self.SELECT_OUTPUT_TYPE = "Select Output Type (GGUF or GGML)"
        self.GGUF_AND_BIN_FILES = "GGUF and Binary Files (*.gguf *.bin)"
        self.BASE_MODEL = "Base Model"
        self.SELECT_BASE_MODEL_FILE = "Select Base Model File (GGUF)"
        self.BASE_MODEL_PATH_REQUIRED = "Base model path is required for GGUF output."
        self.BROWSING_FOR_BASE_MODEL_FILE = "Browsing for base model file..."
        self.SELECT_BASE_MODEL_FOLDER = "Select Base Model Folder (containing safetensors)"
        self.BROWSING_FOR_BASE_MODEL_FOLDER = "Browsing for base model folder..."
        self.LORA_CONVERSION_FROM_TO = "LoRA Conversion from {} to {}"
        self.GENERATING_IMATRIX_FOR = "Generating IMatrix for {}"
        self.MODEL_PATH_REQUIRED_FOR_IMATRIX = "Model path is required for IMatrix generation."

class _French:
    # French localization