mirror of https://github.com/leafspark/AutoGGUF
feat(core): update llama.cpp, improve backend UI, logging, and task handling
- Update the bundled llama.cpp Python scripts to `bc098c3` (adds support for Qwen3, Llama 4, etc.)
- Update requirements and general maintenance
- UI fixes in AutoGGUF
- Sort the backend selection box by newest version (a quick sketch of this follows the commit metadata below)
- Fix the log information box inserting newlines on open, and autoscroll it to the end
- Modify task deletion behavior
- Fix logging for task cancellation/deletion
- Update README information
This commit is contained in:
parent b4817eee06
commit 35ad690198
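The backend-sort change mentioned above reduces to pulling the llama.cpp build number (the `bNNNN` tag) out of each backend folder name and sorting in descending order. A minimal standalone sketch of that logic, using made-up folder names (the actual code lives in `refresh_backends` in the diff below):

```python
import re


def extract_b_val(name: str) -> int:
    # Pull the build number out of names like "llama-b4990-bin-win-cuda"; -1 if absent
    match = re.search(r"b(\d+)", name)
    return int(match.group(1)) if match else -1


# Hypothetical backend folder names, for illustration only
backends = ["llama-b4821-bin-win-avx2", "llama-b4990-bin-win-cuda", "llama-unknown"]
backends.sort(key=extract_b_val, reverse=True)
print(backends[0])  # "llama-b4990-bin-win-cuda" -- the newest build lands at index 0
```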
README.md (27 changed lines)

@ -29,16 +29,16 @@ # AutoGGUF - automated GGUF model quantizer
  ## Features

- - 📩 Download and manage llama.cpp backends
+ - 📩 Update and manage llama.cpp backends
- - 🗃️ Select and quantize GGUF models
+ - 🗃️ Download and quantize GGUF/safetensors models
  - 📐 Configure quantization parameters
- - 💻 Monitor system resources during quantization
+ - 💻 Monitor system resources in real time during quantization
  - ⏳ Parallel quantization + imatrix generation
  - 🎉 LoRA conversion and merging
  - 📁 Preset saving and loading
  - 8️⃣ AutoFP8 quantization
  - 🪓 GGUF splitting and merging
- - 🌐 HTTP API for automated monitoring
+ - 🌐 HTTP API for automation and monitoring

  ## Why AutoGGUF?
  - Fast: Saves time on manual configuration

@ -138,17 +138,20 @@ ## Localizations
  ## Issues

- - Some inconsistent logging
+ - Some inconsistent logging and signal handling
- - Missing translations
+ - Missing or duplicated translations
+ - Buggy/incomplete API interfaces

  ## Planned Features

- - Time estimation for quantization
+ - [ ] Time estimation for quantization
- - Quantization file size estimate
+ - [ ] Quantization file size estimate
- - Perplexity testing
+ - [ ] Perplexity testing
- - bitsandbytes
+ - [ ] bitsandbytes support

- Due to my limited availability and a lack of time, I won't be actively developing new features for this project as much. While I'll continue to publish builds from time to time, I strongly recommend running from source if you want to stay up to date with the latest changes. I'm still committed to keeping dependencies updated weekly and making small maintenance fixes to ensure everything runs smoothly. If you run into any problems or notice issues, please don't hesitate to let me know - I appreciate your feedback and will do my best to address them.
+ #### Project Status
+
+ AutoGGUF has now entered maintenance mode. It's considered stable and feature-complete for most use cases, so I'm not actively developing new features, but I'll continue to publish occasional builds, update dependencies regularly, and fix critical bugs as needed. If you encounter issues or have suggestions, feel free to open an issue.

  ## Support

@ -162,3 +165,5 @@ ## Contributing
  ## Stargazers

  [](https://star-history.com/#leafspark/AutoGGUF&Date)
+
+ `Last Updated: 5/15/2025`
requirements.txt

@ -1,13 +1,13 @@
  PyYAML~=6.0.2
  psutil~=7.0.0
  pynvml~=12.0.0
- PySide6~=6.8.2
+ PySide6~=6.9.0
- safetensors~=0.5.2
+ safetensors~=0.5.3
  numpy<2.0.0
  torch~=2.5.1
  sentencepiece~=0.2.0
- setuptools~=75.6.0
+ setuptools~=80.4.0
- huggingface-hub~=0.29.2
+ huggingface-hub~=0.31.1
- transformers~=4.48.0
+ transformers~=4.51.3
- fastapi~=0.115.6
+ fastapi~=0.115.12
  uvicorn~=0.34.0
setup.py (2 changed lines)

@ -10,7 +10,7 @@
  url="https://github.com/leafspark/AutoGGUF",
  license="apache-2.0",
  author="leafspark",
- author_email="leafspark@duck.com",
+ author_email="leafspark@proton.me",
  description="automatically quant GGUF models",
  install_requires=required,
  entry_points={"console_scripts": ["autogguf-gui = main:main"]},
@ -1,11 +1,10 @@
  import json
- import os
  import shutil
  import urllib.error
  import urllib.request
  from datetime import datetime
  from functools import partial, wraps
- from typing import Any, List, Union
+ from typing import List

  from PySide6.QtCore import *
  from PySide6.QtGui import *

@ -339,15 +338,15 @@ def __init__(self, args: List[str]) -> None:
  output_layout.addWidget(output_button)
  self.merge_gguf_layout.addLayout(output_layout)

- # Split button
+ # Merge button
- split_button = QPushButton(MERGE_GGUF)
+ merge_button = QPushButton(MERGE_GGUF)
- split_button.clicked.connect(
+ merge_button.clicked.connect(
  lambda: self.merge_gguf(
  self.merge_gguf_input.text(),
  self.merge_gguf_output.text(),
  )
  )
- self.merge_gguf_layout.addWidget(split_button)
+ self.merge_gguf_layout.addWidget(merge_button)
  self.merge_gguf_dialog.setLayout(self.merge_gguf_layout)

  # HF Upload Window

@ -763,7 +762,7 @@ def __init__(self, args: List[str]) -> None:

  self.extra_arguments = QLineEdit()
  quant_options_layout.addRow(
- self.create_label(EXTRA_ARGUMENTS, EXTRA_COMMAND_ARGUMENTS),
+ self.create_label(EXTRA_ARGUMENTS, EXTRA_ARGUMENTS_LABEL),
  self.extra_arguments,
  )

@ -1202,15 +1201,25 @@ def refresh_backends(self) -> None:
  and "cudart-llama" not in item.lower()
  ]

+ def extract_b_val(name: str) -> int:
+ match = re.search(r"b(\d+)", name)
+ return int(match.group(1)) if match else -1
+
  if valid_backends:
+ # Sort by newest version
+ valid_backends.sort(key=lambda x: extract_b_val(x[0]), reverse=True)
+
  for name, path in valid_backends:
  self.backend_combo.addItem(name, userData=path)
- self.backend_combo.setEnabled(
- True
- ) # Enable the combo box if there are valid backends
+ self.backend_combo.setEnabled(True)
+
+ # Selects the newest version (now at index 0)
+ self.backend_combo.setCurrentIndex(0)
  else:
  self.backend_combo.addItem(NO_BACKENDS_AVAILABLE)
  self.backend_combo.setEnabled(False)

  self.logger.info(FOUND_VALID_BACKENDS.format(len(valid_backends)))

  def save_task_preset(self, task_item) -> None:

@ -1252,13 +1261,13 @@ def download_finished(self, extract_dir) -> None:
  )
  else:
  QMessageBox.warning(
- self, CUDA_EXTRACTION_FAILED, NO_SUITABLE_CUDA_BACKEND_FOUND
+ self, CUDA_EXTRACTION_FAILED, NO_SUITABLE_CUDA_BACKEND_EXTRACTION
  )
  else:
  QMessageBox.information(
  self,
  DOWNLOAD_COMPLETE,
- LLAMACPP_BINARY_DOWNLOADED_AND_EXTRACTED.format(extract_dir),
+ LLAMACPP_DOWNLOADED_AND_EXTRACTED.format(extract_dir),
  )

  self.refresh_backends()  # Refresh the backends after successful download

@ -1906,12 +1915,25 @@ def show_task_details(self, item) -> None:
  # Load existing content
  if os.path.exists(task_item.log_file):
  with open_file_safe(task_item.log_file, "r") as f:
- log_text.setPlainText(f.read())
+ content = f.read().rstrip("\n")  # Remove trailing newlines
+ log_text.setPlainText(content)
+
+ # Scroll to the end
+ log_text.moveCursor(QTextCursor.End)

  # Connect to the thread if it's still running
  for thread in self.quant_threads:
  if thread.log_file == task_item.log_file:
- thread.output_signal.connect(log_text.appendPlainText)
+ # Create a local slot function that updates the text
+ def update_log(text):
+ log_text.appendPlainText(text)
+ log_text.moveCursor(QTextCursor.End)
+
+ thread.output_signal.connect(update_log)
+ # Disconnect the signal when the dialog is destroyed
+ log_dialog.destroyed.connect(
+ lambda: thread.output_signal.disconnect(update_log)
+ )
  break

  log_dialog.exec()
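Beyond the diff above, here is a small self-contained PySide6 sketch (hypothetical `Worker` class, not AutoGGUF's actual thread type) of the same log-dialog pattern: append streamed output, keep the view scrolled to the bottom, and drop the connection when the dialog is destroyed so the slot cannot fire on a deleted widget.

```python
import sys

from PySide6.QtCore import QObject, QTimer, Signal
from PySide6.QtGui import QTextCursor
from PySide6.QtWidgets import QApplication, QDialog, QPlainTextEdit, QVBoxLayout


class Worker(QObject):
    # Stand-in for the quantization thread's output signal
    output_signal = Signal(str)


def make_log_dialog(worker: Worker) -> QDialog:
    dialog = QDialog()
    log_text = QPlainTextEdit(readOnly=True)
    QVBoxLayout(dialog).addWidget(log_text)

    def update_log(text: str) -> None:
        log_text.appendPlainText(text)
        log_text.moveCursor(QTextCursor.End)  # autoscroll as new lines arrive

    worker.output_signal.connect(update_log)
    # Disconnect when the dialog goes away, mirroring the change in the diff above
    dialog.destroyed.connect(lambda: worker.output_signal.disconnect(update_log))
    return dialog


if __name__ == "__main__":
    app = QApplication(sys.argv)
    worker = Worker()
    dialog = make_log_dialog(worker)
    dialog.show()
    QTimer.singleShot(100, lambda: worker.output_signal.emit("hello from the worker"))
    QTimer.singleShot(500, app.quit)
    sys.exit(app.exec())
```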
@ -1,7 +1,7 @@
  import os
  import re

- AUTOGGUF_VERSION = "v2.0.0"
+ AUTOGGUF_VERSION = "v2.0.1"


  class _Localization:

@ -53,13 +53,11 @@ def __init__(self):
  self.QUANTIZE_TO_FP8_DYNAMIC = "Quantize to FP8 Dynamic"
  self.OPEN_MODEL_FOLDER = "Open Model Folder"
  self.QUANTIZE = "Quantize"
- self.OPEN_MODEL_FOLDER = "Open Model Folder"
  self.INPUT_MODEL = "Input Model:"

  # GGUF Verification
  self.INVALID_GGUF_FILE = "Invalid GGUF file: {}"
  self.SHARDED_MODEL_NAME = "{} (Sharded)"
- self.IMPORTED_MODEL_TOOLTIP = "Imported model: {}"
  self.CONCATENATED_FILE_WARNING = "This is a concatenated file part. It will not work with llama-quantize; please concat the file first."
  self.CONCATENATED_FILES_FOUND = (
  "Found {} concatenated file parts. Please concat the files first."

@ -250,12 +248,6 @@ def __init__(self):
  self.LLAMACPP_DOWNLOADED_AND_EXTRACTED = (
  "llama.cpp binary downloaded and extracted to {0}"
  )
- self.NO_SUITABLE_CUDA_BACKEND_FOUND = (
- "No suitable CUDA backend found for extraction"
- )
- self.LLAMACPP_BINARY_DOWNLOADED_AND_EXTRACTED = (
- "llama.cpp binary downloaded and extracted to {0}"
- )
  self.REFRESHING_LLAMACPP_RELEASES = "Refreshing llama.cpp releases"
  self.UPDATING_ASSET_LIST = "Updating asset list"
  self.UPDATING_CUDA_OPTIONS = "Updating CUDA options"

@ -454,7 +446,6 @@ def __init__(self):
  self.UPLOAD = "Upload"
  self.INFO = "Info"

- self.EXTRA_COMMAND_ARGUMENTS = "Additional command-line arguments"
  self.COPIED_COMMAND_TO_CLIPBOARD = "Copied command to clipboard:"

  # Repository
@ -104,20 +104,35 @@ def show_task_properties(self, item) -> None:
  break

  def cancel_task(self, item) -> None:
- self.logger.info(CANCELLING_TASK.format(item.text()))
+ # TODO: fix possibly buggy signal behavior
  task_item = self.task_list.itemWidget(item)
+ if task_item:
+ task_name = task_item.task_name  # Store the name before any changes
+ self.logger.info(CANCELLING_TASK.format(task_name))
+
+ # Find the thread and disconnect signals before terminating
  for thread in self.quant_threads:
  if thread.log_file == task_item.log_file:
+ # Disconnect all signals from this thread first
+ try:
+ thread.error_signal.disconnect()  # Disconnect all error signal connections
+ thread.output_signal.disconnect()  # Disconnect all output signal connections
+ except TypeError:
+ # No connections to disconnect
+ pass
+
+ # Now terminate the thread
  thread.terminate()
- task_item.update_status(CANCELED)
  self.quant_threads.remove(thread)
  break

  def delete_task(self, item) -> None:
- self.logger.info(DELETING_TASK.format(item.text()))
- # Cancel the task first
- self.cancel_task(item)
+ task_item = self.task_list.itemWidget(item)
+ if not task_item:
+ return
+
+ task_name = task_item.task_name  # Store task_name before deletion
+ self.logger.info(DELETING_TASK.format(task_name))

  reply = QMessageBox.question(
  self,

@ -126,12 +141,16 @@ def delete_task(self, item) -> None:
  QMessageBox.StandardButton.Yes | QMessageBox.StandardButton.No,
  QMessageBox.StandardButton.No,
  )

  if reply == QMessageBox.StandardButton.Yes:
- task_item = self.task_list.itemWidget(item)
+ # Cancel the task first (which disconnects signals)
+ self.cancel_task(item)
+
+ # Now remove from list and delete
  row = self.task_list.row(item)
  self.task_list.takeItem(row)

- if task_item:
+ # Delete the widget after removing from list
  task_item.deleteLater()

  def update_status(self, status) -> None:
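The cancellation change above boils down to one ordering rule: disconnect every slot from the worker's signals before terminating it, so nothing still queued can fire into widgets that are about to disappear. A standalone sketch of that teardown order, with hypothetical class and signal names:

```python
from PySide6.QtCore import QThread, Signal


class QuantThread(QThread):
    # Hypothetical stand-ins for the real worker's signals
    output_signal = Signal(str)
    error_signal = Signal(str)

    def run(self) -> None:
        ...  # long-running work would go here


def cancel(thread: QuantThread) -> None:
    try:
        # disconnect() with no arguments drops every connection on the signal
        thread.output_signal.disconnect()
        thread.error_signal.disconnect()
    except (TypeError, RuntimeError):
        pass  # nothing was connected
    thread.terminate()  # hard stop, as in the change above
    thread.wait()
```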
(File diff suppressed because it is too large.)
@ -24,10 +24,10 @@

  if TYPE_CHECKING:
  from torch import Tensor

  import gguf

- from convert_hf_to_gguf import LazyTorchTensor, Model
+ # reuse model definitions from convert_hf_to_gguf.py
+ from convert_hf_to_gguf import LazyTorchTensor, ModelBase

  logger = logging.getLogger("lora-to-gguf")

@ -38,9 +38,10 @@ class PartialLoraTensor:
  B: Tensor | None = None


+ # magic to support tensor shape modifications and splitting
  class LoraTorchTensor:
- _lora_A: Tensor
+ _lora_A: Tensor  # (n_rank, row_size)
- _lora_B: Tensor
+ _lora_B: Tensor  # (col_size, n_rank)
  _rank: int

  def __init__(self, A: Tensor, B: Tensor):
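A side note on the shape comments just added to LoraTorchTensor: with lora_A of shape (n_rank, row_size) and lora_B of shape (col_size, n_rank), the weight delta they encode is B @ A, giving (col_size, row_size), up to the LoRA alpha/rank scaling. A tiny illustration with made-up sizes:

```python
import torch

n_rank, row_size, col_size = 8, 4096, 11008  # made-up sizes for illustration
lora_A = torch.randn(n_rank, row_size)   # (n_rank, row_size)
lora_B = torch.randn(col_size, n_rank)   # (col_size, n_rank)
delta_w = lora_B @ lora_A                # effective weight delta
assert delta_w.shape == (col_size, row_size)
```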
@ -58,14 +59,20 @@ def get_lora_A_B(self) -> tuple[Tensor, Tensor]:

  def __getitem__(
  self,
- indices: SupportsIndex | slice | tuple[SupportsIndex | slice | Tensor, ...],
+ indices: (
+ SupportsIndex
+ | slice
+ | tuple[
+ SupportsIndex | slice | Tensor, ...
+ ]  # TODO: add ellipsis in the type signature
+ ),
  ) -> LoraTorchTensor:
  shape = self.shape
  if isinstance(indices, SupportsIndex):
  if len(shape) > 2:
  return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
  else:
- raise NotImplementedError
+ raise NotImplementedError  # can't return a vector
  elif isinstance(indices, slice):
  if len(shape) > 2:
  return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])

@ -75,7 +82,7 @@ def __getitem__(
  assert len(indices) > 0
  if indices[-1] is Ellipsis:
  return self[indices[:-1]]
+ # expand ellipsis
  indices = tuple(
  u
  for v in (

@ -95,6 +102,7 @@ def __getitem__(
  *(slice(None, None) for _ in range(len(indices), len(shape))),
  )

+ # TODO: make sure this is correct
  indices_A = (
  *(
  (

@ -110,7 +118,7 @@ def __getitem__(
  indices_B = indices[:-1]
  return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
  else:
- raise NotImplementedError
+ raise NotImplementedError  # unknown indice type

  @property
  def dtype(self) -> torch.dtype:

@ -133,8 +141,9 @@ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
  new_shape = cast(tuple[int, ...], shape)
  orig_shape = self.shape
  if len(new_shape) < 2:
- raise NotImplementedError
+ raise NotImplementedError  # can't become a vector

+ # expand -1 in the shape
  if any(dim == -1 for dim in new_shape):
  n_elems = prod(orig_shape)
  n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)

@ -144,7 +153,7 @@ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
  )

  if new_shape[-1] != orig_shape[-1]:
- raise NotImplementedError
+ raise NotImplementedError  # can't reshape the row size trivially

  shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
  shape_B = (*new_shape[:-1], self._rank)

@ -163,7 +172,7 @@ def permute(self, *dims: int) -> LoraTorchTensor:
  shape = self.shape
  dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
  if dims[-1] == -1:
+ # TODO: support higher dimensional A shapes bigger than 1
  assert all(dim == 1 for dim in self._lora_A.shape[:-2])
  return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
  if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:

@ -171,7 +180,7 @@ def permute(self, *dims: int) -> LoraTorchTensor:
  self._lora_B.permute(*dims), self._lora_A.permute(*dims)
  )
  else:
+ # TODO: compose the above two
  raise NotImplementedError

  def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:

@ -190,7 +199,7 @@ def to(self, *args, **kwargs):

  @classmethod
  def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
- del types
+ del types  # unused

  if kwargs is None:
  kwargs = {}

@ -231,7 +240,7 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
  base_name = lora_tensor_name.replace("base_model.model.", "")
  base_name = base_name.replace(".lora_A.weight", ".weight")
  base_name = base_name.replace(".lora_B.weight", ".weight")
+ # models produced by mergekit-extract-lora have token embeddings in the adapter
  base_name = base_name.replace(".lora_embedding_A", ".weight")
  base_name = base_name.replace(".lora_embedding_B", ".weight")
  return base_name

@ -293,7 +302,7 @@ def parse_args() -> argparse.Namespace:


  def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
+ # normally, adapter does not come with base model config, we need to load it from AutoConfig
  config = AutoConfig.from_pretrained(hf_model_id)
  return config.to_dict()

@ -321,11 +330,11 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
  if args.outfile is not None:
  fname_out = args.outfile
  else:
+ # output in the same directory as the model by default
  fname_out = dir_lora

  if os.path.exists(input_model):
+ # lazy import load_file only if lora is in safetensors format.
  from safetensors.torch import load_file

  lora_model = load_file(input_model, device="cpu")

@ -333,9 +342,11 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
  input_model = os.path.join(dir_lora, "adapter_model.bin")
  lora_model = torch.load(input_model, map_location="cpu", weights_only=True)

+ # load LoRA config
  with open(lora_config, "r") as f:
  lparams: dict[str, Any] = json.load(f)

+ # load base model
  if base_model_id is not None:
  logger.info(f"Loading base model from Hugging Face: {base_model_id}")
  hparams = load_hparams_from_hf(base_model_id)

@ -361,11 +372,11 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
  sys.exit(1)
  else:
  logger.info(f"Loading base model: {dir_base_model.name}")
- hparams = Model.load_hparams(dir_base_model)
+ hparams = ModelBase.load_hparams(dir_base_model)

  with torch.inference_mode():
  try:
- model_class = Model.from_model_architecture(hparams["architectures"][0])
+ model_class = ModelBase.from_model_architecture(hparams["architectures"][0])
  except NotImplementedError:
  logger.error(f"Model {hparams['architectures'][0]} is not supported")
  sys.exit(1)

@ -397,7 +408,7 @@ def set_gguf_parameters(self):
  )

  def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+ # Never add extra tensors (e.g. rope_freqs) for LoRA adapters
  return ()

  def get_tensors(self) -> Iterator[tuple[str, Tensor]]:

@ -407,13 +418,13 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
  if self.lazy:
  tensor = LazyTorchTensor.from_eager(tensor)
  base_name = get_base_tensor_name(name)
+ # note: mergekit-extract-lora also adds token embeddings to the adapter
  is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
  is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
  if not is_lora_a and not is_lora_b:
  if ".base_layer.weight" in name:
  continue
+ # mergekit-extract-lora add these layernorm to the adapter, we need to keep them
  if "_layernorm" in name or ".norm" in name:
  yield (base_name, tensor)
  continue

@ -452,21 +463,27 @@ def modify_tensors(
  self, data_torch: Tensor, name: str, bid: int | None
  ) -> Iterable[tuple[str, Tensor]]:
  dest = list(super().modify_tensors(data_torch, name, bid))
+ # some archs may have the same tensor for lm_head and output (tie word embeddings)
+ # in this case, adapters targeting lm_head will fail when using llama-export-lora
+ # therefore, we ignore them for now
+ # see: https://github.com/ggml-org/llama.cpp/issues/9065
  if name == "lm_head.weight" and len(dest) == 0:
  raise ValueError(
  "lm_head is present in adapter, but is ignored in base model"
  )
  for dest_name, dest_data in dest:
+ # mergekit-extract-lora add these layernorm to the adapter
  if "_norm" in dest_name:
  assert dest_data.dim() == 1
  yield (dest_name, dest_data)
  continue

+ # otherwise, we must get the lora_A and lora_B tensors
  assert isinstance(dest_data, LoraTorchTensor)
  lora_a, lora_b = dest_data.get_lora_A_B()

+ # note: mergekit-extract-lora flip and transpose A and B
+ # here we only need to transpose token_embd.lora_a, see llm_build_inp_embd()
  if "token_embd.weight" in dest_name:
  lora_a = lora_a.T
@ -108,6 +108,7 @@ class LLM:
  EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
  EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm"
  EXPERT_GATING_FUNC = "{arch}.expert_gating_func"
+ MOE_EVERY_N_LAYERS = "{arch}.moe_every_n_layers"
  POOLING_TYPE = "{arch}.pooling_type"
  LOGIT_SCALE = "{arch}.logit_scale"
  DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"

@ -120,6 +121,7 @@ class LLM:
  RESIDUAL_SCALE = "{arch}.residual_scale"
  EMBEDDING_SCALE = "{arch}.embedding_scale"
  TOKEN_SHIFT_COUNT = "{arch}.token_shift_count"
+ INTERLEAVE_MOE_LAYER_STEP = "{arch}.interleave_moe_layer_step"

  class Attention:
  HEAD_COUNT = "{arch}.attention.head_count"

@ -142,6 +144,8 @@ class Attention:
  REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
  SLIDING_WINDOW = "{arch}.attention.sliding_window"
  SCALE = "{arch}.attention.scale"
+ KEY_LENGTH_MLA = "{arch}.attention.key_length_mla"
+ VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla"

  class Rope:
  DIMENSION_COUNT = "{arch}.rope.dimension_count"

@ -221,6 +225,30 @@ class Adapter:
  TYPE = "adapter.type"
  LORA_ALPHA = "adapter.lora.alpha"

+ class ClipVision:
+ PROJECTOR_TYPE = "clip.projector_type"
+ HAS_VISION_ENCODER = "clip.has_vision_encoder"
+ HAS_LLAVA_PROJECTOR = "clip.has_llava_projector"
+ IMAGE_SIZE = "clip.vision.image_size"
+ PATCH_SIZE = "clip.vision.patch_size"
+ EMBEDDING_LENGTH = "clip.vision.embedding_length"
+ FEED_FORWARD_LENGTH = "clip.vision.feed_forward_length"
+ PROJECTION_DIM = "clip.vision.projection_dim"
+ BLOCK_COUNT = "clip.vision.block_count"
+ IMAGE_MEAN = "clip.vision.image_mean"
+ IMAGE_STD = "clip.vision.image_std"
+ SPATIAL_MERGE_SIZE = "clip.vision.spatial_merge_size"
+ USE_GELU = "clip.use_gelu"
+ USE_SILU = "clip.use_silu"
+ N_WA_PATTERN = "clip.vision.n_wa_pattern"  # used by qwen2.5vl
+
+ class Attention:
+ HEAD_COUNT = "clip.vision.attention.head_count"
+ LAYERNORM_EPS = "clip.vision.attention.layer_norm_epsilon"
+
+ class Projector:
+ SCALE_FACTOR = "clip.vision.projector.scale_factor"
+
  #
  # recommended mapping of model tensor names for storage in gguf

@ -230,10 +258,13 @@ class Adapter:
  class GGUFType:
  MODEL = "model"
  ADAPTER = "adapter"
+ CLIP_VISION = "clip-vision"


  class MODEL_ARCH(IntEnum):
+ CLIP_VISION = auto()  # dummy arch for clip.cpp
  LLAMA = auto()
+ LLAMA4 = auto()
  DECI = auto()
  FALCON = auto()
  BAICHUAN = auto()

@ -246,6 +277,7 @@ class MODEL_ARCH(IntEnum):
  REFACT = auto()
  BERT = auto()
  NOMIC_BERT = auto()
+ NOMIC_BERT_MOE = auto()
  JINA_BERT_V2 = auto()
  BLOOM = auto()
  STABLELM = auto()

@ -253,6 +285,8 @@ class MODEL_ARCH(IntEnum):
  QWEN2 = auto()
  QWEN2MOE = auto()
  QWEN2VL = auto()
+ QWEN3 = auto()
+ QWEN3MOE = auto()
  PHI2 = auto()
  PHI3 = auto()
  PHIMOE = auto()

@ -283,6 +317,7 @@ class MODEL_ARCH(IntEnum):
  DEEPSEEK = auto()
  DEEPSEEK2 = auto()
  CHATGLM = auto()
+ GLM4 = auto()
  BITNET = auto()
  T5 = auto()
  T5ENCODER = auto()

@ -293,6 +328,18 @@ class MODEL_ARCH(IntEnum):
  GRANITE_MOE = auto()
  CHAMELEON = auto()
  WAVTOKENIZER_DEC = auto()
+ PLM = auto()
+ BAILINGMOE = auto()
+
+
+ class VISION_PROJECTOR_TYPE(IntEnum):
+ MLP = auto()
+ LDP = auto()
+ LDPV2 = auto()
+ RESAMPLER = auto()
+ GLM_EDGE = auto()
+ MERGER = auto()
+ GEMMA3 = auto()


  class MODEL_TENSOR(IntEnum):

@ -382,6 +429,8 @@ class MODEL_TENSOR(IntEnum):
  ATTN_Q_B = auto()
  ATTN_KV_A_MQA = auto()
  ATTN_KV_B = auto()
+ ATTN_K_B = auto()
+ ATTN_V_B = auto()
  ATTN_Q_A_NORM = auto()
  ATTN_KV_A_NORM = auto()
  FFN_SUB_NORM = auto()

@ -432,10 +481,51 @@ class MODEL_TENSOR(IntEnum):
  POSNET_ATTN_K = auto()
  POSNET_ATTN_V = auto()
  POSNET_ATTN_OUT = auto()
+ # vision
+ V_MMPROJ = auto()
+ V_MMPROJ_FC = auto()
+ V_MMPROJ_MLP = auto()
+ V_MMPROJ_PEG = auto()
+ V_ENC_EMBD_CLS = auto()
+ V_ENC_EMBD_PATCH = auto()
+ V_ENC_EMBD_POS = auto()
+ V_ENC_ATTN_Q = auto()
+ V_ENC_ATTN_Q_NORM = auto()
+ V_ENC_ATTN_K = auto()
+ V_ENC_ATTN_K_NORM = auto()
+ V_ENC_ATTN_V = auto()
+ V_ENC_INPUT_NORM = auto()
+ V_ENC_OUTPUT = auto()
+ V_ENC_OUTPUT_NORM = auto()
+ V_ENC_FFN_UP = auto()
+ V_ENC_FFN_GATE = auto()
+ V_ENC_FFN_DOWN = auto()
+ V_LAYER_SCALE_1 = auto()
+ V_LAYER_SCALE_2 = auto()
+ V_PRE_NORM = auto()
+ V_POST_NORM = auto()
+ V_MM_INP_NORM = auto()
+ V_MM_INP_PROJ = auto()  # gemma3
+ V_MM_SOFT_EMB_NORM = auto()  # gemma3
+ V_RESMPL_POS_EMBD_K = auto()  # minicpmv
+ V_RESMPL_ATTN_Q = auto()  # minicpmv
+ V_RESMPL_ATTN_K = auto()  # minicpmv
+ V_RESMPL_ATTN_V = auto()  # minicpmv
+ V_RESMPL_ATTN_OUT = auto()  # minicpmv
+ V_RESMPL_KV = auto()  # minicpmv
+ V_RESMPL_KV_NORM = auto()  # minicpmv
+ V_RESMPL_POST_NORM = auto()  # minicpmv
+ V_RESMPL_Q_NORM = auto()  # minicpmv
+ V_RESMPL_PROJ = auto()  # minicpmv
+ V_RESMPL_QUERY = auto()  # minicpmv
+ V_TOK_EMBD_IMG_BREAK = auto()  # pixtral
+ V_MM_PATCH_MERGER = auto()  # mistral small 3.1


  MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
+ MODEL_ARCH.CLIP_VISION: "clip",  # dummy arch for clip.cpp
  MODEL_ARCH.LLAMA: "llama",
+ MODEL_ARCH.LLAMA4: "llama4",
  MODEL_ARCH.DECI: "deci",
  MODEL_ARCH.FALCON: "falcon",
  MODEL_ARCH.BAICHUAN: "baichuan",

@ -448,6 +538,7 @@ class MODEL_TENSOR(IntEnum):
  MODEL_ARCH.REFACT: "refact",
  MODEL_ARCH.BERT: "bert",
  MODEL_ARCH.NOMIC_BERT: "nomic-bert",
+ MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe",
  MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
  MODEL_ARCH.BLOOM: "bloom",
  MODEL_ARCH.STABLELM: "stablelm",

@ -455,6 +546,8 @@ class MODEL_TENSOR(IntEnum):
  MODEL_ARCH.QWEN2: "qwen2",
  MODEL_ARCH.QWEN2MOE: "qwen2moe",
  MODEL_ARCH.QWEN2VL: "qwen2vl",
+ MODEL_ARCH.QWEN3: "qwen3",
+ MODEL_ARCH.QWEN3MOE: "qwen3moe",
  MODEL_ARCH.PHI2: "phi2",
  MODEL_ARCH.PHI3: "phi3",
  MODEL_ARCH.PHIMOE: "phimoe",

@ -485,6 +578,7 @@ class MODEL_TENSOR(IntEnum):
  MODEL_ARCH.DEEPSEEK: "deepseek",
  MODEL_ARCH.DEEPSEEK2: "deepseek2",
  MODEL_ARCH.CHATGLM: "chatglm",
+ MODEL_ARCH.GLM4: "glm4",
  MODEL_ARCH.BITNET: "bitnet",
  MODEL_ARCH.T5: "t5",
  MODEL_ARCH.T5ENCODER: "t5encoder",

@ -495,6 +589,18 @@ class MODEL_TENSOR(IntEnum):
  MODEL_ARCH.GRANITE_MOE: "granitemoe",
  MODEL_ARCH.CHAMELEON: "chameleon",
  MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
+ MODEL_ARCH.PLM: "plm",
+ MODEL_ARCH.BAILINGMOE: "bailingmoe",
+ }
+
+ VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
+ VISION_PROJECTOR_TYPE.MLP: "mlp",
+ VISION_PROJECTOR_TYPE.LDP: "ldp",
+ VISION_PROJECTOR_TYPE.LDPV2: "ldpv2",
+ VISION_PROJECTOR_TYPE.RESAMPLER: "resampler",
+ VISION_PROJECTOR_TYPE.GLM_EDGE: "adapter",
+ VISION_PROJECTOR_TYPE.MERGER: "qwen2vl_merger",
+ VISION_PROJECTOR_TYPE.GEMMA3: "gemma3",
  }

  TENSOR_NAMES: dict[MODEL_TENSOR, str] = {

@ -584,6 +690,8 @@ class MODEL_TENSOR(IntEnum):
  MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
  MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
  MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
+ MODEL_TENSOR.ATTN_K_B: "blk.{bid}.attn_k_b",
+ MODEL_TENSOR.ATTN_V_B: "blk.{bid}.attn_v_b",
  MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
  MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
  MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",

@ -634,9 +742,88 @@ class MODEL_TENSOR(IntEnum):
  MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k",
  MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v",
  MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output",
+ # vision
+ MODEL_TENSOR.V_MMPROJ: "mm.{bid}",
+ MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc",
+ MODEL_TENSOR.V_MMPROJ_MLP: "mm.model.mlp.{bid}",
+ MODEL_TENSOR.V_MMPROJ_PEG: "mm.model.peg.{bid}",
+ MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd",
+ MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd",
+ MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd",
+ MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q",
+ MODEL_TENSOR.V_ENC_ATTN_Q_NORM: "v.blk.{bid}.attn_q_norm",
+ MODEL_TENSOR.V_ENC_ATTN_K: "v.blk.{bid}.attn_k",
+ MODEL_TENSOR.V_ENC_ATTN_K_NORM: "v.blk.{bid}.attn_k_norm",
+ MODEL_TENSOR.V_ENC_ATTN_V: "v.blk.{bid}.attn_v",
+ MODEL_TENSOR.V_ENC_INPUT_NORM: "v.blk.{bid}.ln1",
+ MODEL_TENSOR.V_ENC_OUTPUT: "v.blk.{bid}.attn_out",
+ MODEL_TENSOR.V_ENC_OUTPUT_NORM: "v.blk.{bid}.ln2",
+ MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up",
+ MODEL_TENSOR.V_ENC_FFN_GATE: "v.blk.{bid}.ffn_gate",
+ MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down",
+ MODEL_TENSOR.V_LAYER_SCALE_1: "v.blk.{bid}.ls1",
+ MODEL_TENSOR.V_LAYER_SCALE_2: "v.blk.{bid}.ls2",
+ MODEL_TENSOR.V_PRE_NORM: "v.pre_ln",
+ MODEL_TENSOR.V_POST_NORM: "v.post_ln",
+ MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection",
+ MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm",
+ MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm",
+ MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k",
+ MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q",
+ MODEL_TENSOR.V_RESMPL_ATTN_K: "resampler.attn.k",
+ MODEL_TENSOR.V_RESMPL_ATTN_V: "resampler.attn.v",
+ MODEL_TENSOR.V_RESMPL_ATTN_OUT: "resampler.attn.out",
+ MODEL_TENSOR.V_RESMPL_KV: "resampler.kv",
+ MODEL_TENSOR.V_RESMPL_KV_NORM: "resampler.ln_kv",
+ MODEL_TENSOR.V_RESMPL_POST_NORM: "resampler.ln_post",
+ MODEL_TENSOR.V_RESMPL_Q_NORM: "resampler.ln_q",
+ MODEL_TENSOR.V_RESMPL_PROJ: "resampler.proj",
+ MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query",
+ MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break",  # pixtral
+ MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger",  # mistral small 3.1
  }

  MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
+ MODEL_ARCH.CLIP_VISION: [
+ MODEL_TENSOR.V_MMPROJ,
+ MODEL_TENSOR.V_MMPROJ_FC,
+ MODEL_TENSOR.V_MMPROJ_MLP,
+ MODEL_TENSOR.V_MMPROJ_PEG,
+ MODEL_TENSOR.V_ENC_EMBD_CLS,
+ MODEL_TENSOR.V_ENC_EMBD_PATCH,
+ MODEL_TENSOR.V_ENC_EMBD_POS,
+ MODEL_TENSOR.V_ENC_ATTN_Q,
+ MODEL_TENSOR.V_ENC_ATTN_Q_NORM,
+ MODEL_TENSOR.V_ENC_ATTN_K,
+ MODEL_TENSOR.V_ENC_ATTN_K_NORM,
+ MODEL_TENSOR.V_ENC_ATTN_V,
+ MODEL_TENSOR.V_ENC_INPUT_NORM,
+ MODEL_TENSOR.V_ENC_OUTPUT,
+ MODEL_TENSOR.V_ENC_OUTPUT_NORM,
+ MODEL_TENSOR.V_ENC_FFN_UP,
+ MODEL_TENSOR.V_ENC_FFN_GATE,
+ MODEL_TENSOR.V_ENC_FFN_DOWN,
+ MODEL_TENSOR.V_LAYER_SCALE_1,
+ MODEL_TENSOR.V_LAYER_SCALE_2,
+ MODEL_TENSOR.V_PRE_NORM,
+ MODEL_TENSOR.V_POST_NORM,
+ MODEL_TENSOR.V_MM_INP_PROJ,
+ MODEL_TENSOR.V_MM_INP_NORM,
+ MODEL_TENSOR.V_MM_SOFT_EMB_NORM,
+ MODEL_TENSOR.V_RESMPL_POS_EMBD_K,
+ MODEL_TENSOR.V_RESMPL_ATTN_Q,
+ MODEL_TENSOR.V_RESMPL_ATTN_K,
+ MODEL_TENSOR.V_RESMPL_ATTN_V,
+ MODEL_TENSOR.V_RESMPL_ATTN_OUT,
+ MODEL_TENSOR.V_RESMPL_KV,
+ MODEL_TENSOR.V_RESMPL_KV_NORM,
+ MODEL_TENSOR.V_RESMPL_POST_NORM,
+ MODEL_TENSOR.V_RESMPL_Q_NORM,
+ MODEL_TENSOR.V_RESMPL_PROJ,
+ MODEL_TENSOR.V_RESMPL_QUERY,
+ MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK,
+ MODEL_TENSOR.V_MM_PATCH_MERGER,
+ ],
  MODEL_ARCH.LLAMA: [
  MODEL_TENSOR.TOKEN_EMBD,
  MODEL_TENSOR.OUTPUT_NORM,

@ -657,6 +844,29 @@ class MODEL_TENSOR(IntEnum):
  MODEL_TENSOR.FFN_DOWN_EXP,
  MODEL_TENSOR.FFN_UP_EXP,
  ],
+ MODEL_ARCH.LLAMA4: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ROPE_FREQS,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.ATTN_ROT_EMBD,
+ MODEL_TENSOR.FFN_GATE_INP,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ MODEL_TENSOR.FFN_GATE_EXP,
+ MODEL_TENSOR.FFN_DOWN_EXP,
+ MODEL_TENSOR.FFN_UP_EXP,
+ MODEL_TENSOR.FFN_GATE_SHEXP,
+ MODEL_TENSOR.FFN_DOWN_SHEXP,
+ MODEL_TENSOR.FFN_UP_SHEXP,
+ ],
  MODEL_ARCH.DECI: [
  MODEL_TENSOR.TOKEN_EMBD,
  MODEL_TENSOR.OUTPUT_NORM,

@ -780,6 +990,22 @@ class MODEL_TENSOR(IntEnum):
  MODEL_TENSOR.FFN_UP,
  MODEL_TENSOR.LAYER_OUT_NORM,
  ],
+ MODEL_ARCH.NOMIC_BERT_MOE: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.TOKEN_EMBD_NORM,
+ MODEL_TENSOR.TOKEN_TYPES,
+ MODEL_TENSOR.POS_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.ATTN_OUT_NORM,
+ MODEL_TENSOR.ATTN_QKV,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ MODEL_TENSOR.FFN_GATE_INP,
+ MODEL_TENSOR.FFN_DOWN_EXP,
+ MODEL_TENSOR.FFN_UP_EXP,
+ MODEL_TENSOR.LAYER_OUT_NORM,
+ ],
  MODEL_ARCH.JINA_BERT_V2: [
  MODEL_TENSOR.TOKEN_EMBD,
  MODEL_TENSOR.TOKEN_EMBD_NORM,

@ -930,6 +1156,40 @@ class MODEL_TENSOR(IntEnum):
  MODEL_TENSOR.FFN_DOWN_SHEXP,
  MODEL_TENSOR.FFN_UP_SHEXP,
  ],
+ MODEL_ARCH.QWEN3: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ROPE_FREQS,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_Q_NORM,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_K_NORM,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ ],
+ MODEL_ARCH.QWEN3MOE: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_Q_NORM,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_K_NORM,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE_INP,
+ MODEL_TENSOR.FFN_GATE_EXP,
+ MODEL_TENSOR.FFN_DOWN_EXP,
+ MODEL_TENSOR.FFN_UP_EXP,
+ ],
  MODEL_ARCH.PLAMO: [
  MODEL_TENSOR.TOKEN_EMBD,
  MODEL_TENSOR.OUTPUT_NORM,

@ -1120,6 +1380,7 @@ class MODEL_TENSOR(IntEnum):
  ],
  MODEL_ARCH.GEMMA3: [
  MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT,
  MODEL_TENSOR.OUTPUT_NORM,
  MODEL_TENSOR.ATTN_Q,
  MODEL_TENSOR.ATTN_Q_NORM,

@ -1453,6 +1714,8 @@ class MODEL_TENSOR(IntEnum):
  MODEL_TENSOR.ATTN_Q_B,
  MODEL_TENSOR.ATTN_KV_A_MQA,
  MODEL_TENSOR.ATTN_KV_B,
+ MODEL_TENSOR.ATTN_K_B,
+ MODEL_TENSOR.ATTN_V_B,
  MODEL_TENSOR.ATTN_Q_A_NORM,
  MODEL_TENSOR.ATTN_KV_A_NORM,
  MODEL_TENSOR.ATTN_OUT,

@ -1470,6 +1733,20 @@ class MODEL_TENSOR(IntEnum):
  MODEL_TENSOR.FFN_UP_SHEXP,
  MODEL_TENSOR.FFN_EXP_PROBS_B,
  ],
+ MODEL_ARCH.PLM: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_KV_A_MQA,
+ MODEL_TENSOR.ATTN_KV_A_NORM,
+ MODEL_TENSOR.ATTN_KV_B,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_UP,
+ MODEL_TENSOR.FFN_DOWN,
+ ],
  MODEL_ARCH.CHATGLM: [
  MODEL_TENSOR.TOKEN_EMBD,
  MODEL_TENSOR.ROPE_FREQS,

@ -1485,6 +1762,23 @@ class MODEL_TENSOR(IntEnum):
  MODEL_TENSOR.FFN_DOWN,
  MODEL_TENSOR.FFN_UP,
  ],
+ MODEL_ARCH.GLM4: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.ROPE_FREQS,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_QKV,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ MODEL_TENSOR.ATTN_POST_NORM,
+ MODEL_TENSOR.FFN_POST_NORM,
+ ],
  MODEL_ARCH.BITNET: [
  MODEL_TENSOR.ATTN_Q,
  MODEL_TENSOR.ATTN_K,

@ -1618,6 +1912,9 @@ class MODEL_TENSOR(IntEnum):
  MODEL_TENSOR.FFN_GATE_EXP,
  MODEL_TENSOR.FFN_DOWN_EXP,
  MODEL_TENSOR.FFN_UP_EXP,
+ MODEL_TENSOR.FFN_GATE_SHEXP,
+ MODEL_TENSOR.FFN_UP_SHEXP,
+ MODEL_TENSOR.FFN_DOWN_SHEXP,
  ],
  MODEL_ARCH.CHAMELEON: [
  MODEL_TENSOR.TOKEN_EMBD,

@ -1657,6 +1954,25 @@ class MODEL_TENSOR(IntEnum):
  MODEL_TENSOR.POSNET_ATTN_V,
  MODEL_TENSOR.POSNET_ATTN_OUT,
  ],
+ MODEL_ARCH.BAILINGMOE: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ROPE_FREQS,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.FFN_GATE_INP,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE_EXP,
+ MODEL_TENSOR.FFN_DOWN_EXP,
+ MODEL_TENSOR.FFN_UP_EXP,
+ MODEL_TENSOR.FFN_GATE_SHEXP,
+ MODEL_TENSOR.FFN_DOWN_SHEXP,
+ MODEL_TENSOR.FFN_UP_SHEXP,
+ ],
  # TODO
  }

@ -1709,6 +2025,9 @@ class MODEL_TENSOR(IntEnum):
  MODEL_TENSOR.ROPE_FREQS,
  MODEL_TENSOR.ATTN_ROT_EMBD,
  ],
+ MODEL_ARCH.BAILINGMOE: [
+ MODEL_TENSOR.ROPE_FREQS,
+ ],
  }

  #

@ -1736,6 +2055,8 @@ class PoolingType(IntEnum):
  NONE = 0
  MEAN = 1
  CLS = 2
+ LAST = 3
+ RANK = 4


  class GGMLQuantizationType(IntEnum):

@ -1862,6 +2183,15 @@ def get_type(val: Any) -> GGUFValueType:
  raise ValueError(f"Unknown type: {type(val)}")


+ class VisionProjectorType:
+ GEMMA3 = "gemma3"
+ IDEFICS3 = "idefics3"
+ PIXTRAL = "pixtral"
+ QWEN2VL = "qwen2vl_merger"
+ QWEN25VL = "qwen2.5vl_merger"
+ INTERNVL = "internvl"
+
+
  # Items here are (block size, type size)
  QK_K = 256
  GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
@ -1,7 +1,3 @@
- # This file left for compatibility. If you want to use the GGUF API from Python
- # then don't import gguf/gguf.py directly. If you're looking for examples, see the
- # examples/ directory for gguf-py
-
  import importlib
  import sys
  from pathlib import Path
@ -1,7 +1,3 @@
- #
- # GGUF file reading/modification support. For API usage information,
- # please see the files scripts/ for some fairly simple examples.
- #
  from __future__ import annotations

  import logging
@@ -774,6 +774,12 @@ def add_key_length(self, length: int) -> None:
     def add_value_length(self, length: int) -> None:
         self.add_uint32(Keys.Attention.VALUE_LENGTH.format(arch=self.arch), length)
 
+    def add_key_length_mla(self, length: int) -> None:
+        self.add_uint32(Keys.Attention.KEY_LENGTH_MLA.format(arch=self.arch), length)
+
+    def add_value_length_mla(self, length: int) -> None:
+        self.add_uint32(Keys.Attention.VALUE_LENGTH_MLA.format(arch=self.arch), length)
+
     def add_max_alibi_bias(self, bias: float) -> None:
         self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias)
 
@@ -807,6 +813,9 @@ def add_expert_weights_norm(self, value: bool) -> None:
     def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None:
         self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value)
 
+    def add_moe_every_n_layers(self, value: int) -> None:
+        self.add_uint32(Keys.LLM.MOE_EVERY_N_LAYERS.format(arch=self.arch), value)
+
     def add_swin_norm(self, value: bool) -> None:
         self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)
 
@@ -831,6 +840,11 @@ def add_wkv_head_size(self, size: int) -> None:
     def add_token_shift_count(self, count: int) -> None:
         self.add_uint32(Keys.LLM.TOKEN_SHIFT_COUNT.format(arch=self.arch), count)
 
+    def add_interleave_moe_layer_step(self, value: int) -> None:
+        self.add_uint32(
+            Keys.LLM.INTERLEAVE_MOE_LAYER_STEP.format(arch=self.arch), value
+        )
+
     def add_layer_norm_eps(self, value: float) -> None:
         self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
 
@@ -1017,6 +1031,59 @@ def add_eot_token_id(self, id: int) -> None:
     def add_eom_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.EOM_ID, id)
 
+    # for vision models
+
+    def add_vision_projection_dim(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.PROJECTION_DIM, value)
+
+    def add_vision_has_vision_encoder(self, value: bool) -> None:
+        self.add_bool(Keys.ClipVision.HAS_VISION_ENCODER, value)
+
+    def add_vision_patch_size(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.PATCH_SIZE, value)
+
+    def add_vision_embedding_length(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.EMBEDDING_LENGTH, value)
+
+    def add_vision_feed_forward_length(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.FEED_FORWARD_LENGTH, value)
+
+    def add_vision_block_count(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.BLOCK_COUNT, value)
+
+    def add_vision_head_count(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.Attention.HEAD_COUNT, value)
+
+    def add_vision_projector_type(self, value: str) -> None:
+        self.add_string(Keys.ClipVision.PROJECTOR_TYPE, value)
+
+    def add_vision_attention_layernorm_eps(self, value: float) -> None:
+        self.add_float32(Keys.ClipVision.Attention.LAYERNORM_EPS, value)
+
+    def add_vision_image_size(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value)
+
+    def add_vision_image_mean(self, values: Sequence[float]) -> None:
+        self.add_array(Keys.ClipVision.IMAGE_MEAN, values)
+
+    def add_vision_image_std(self, values: Sequence[float]) -> None:
+        self.add_array(Keys.ClipVision.IMAGE_STD, values)
+
+    def add_vision_spatial_merge_size(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.SPATIAL_MERGE_SIZE, value)
+
+    def add_vision_use_gelu(self, value: bool) -> None:
+        self.add_bool(Keys.ClipVision.USE_GELU, value)
+
+    def add_vision_use_silu(self, value: bool) -> None:
+        self.add_bool(Keys.ClipVision.USE_SILU, value)
+
+    def add_vision_projector_scale_factor(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.Projector.SCALE_FACTOR, value)
+
+    def add_vision_n_wa_pattern(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value)
+
     def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
         pack_prefix = ""
         if not skip_pack_prefix:
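
The writer now exposes MLA key/value lengths, MoE layer-interval keys, and a family of `add_vision_*` setters for CLIP-style projector metadata. A hedged sketch of how a conversion script might drive the new vision setters; the writer construction, the `"clip"` architecture string, the output filename, and all numeric values are illustrative assumptions rather than anything taken from this commit:

```python
# Illustrative sketch only: calls the add_vision_* setters added above when
# writing projector metadata. Values are model-dependent examples.
from gguf import GGUFWriter  # assumes the vendored gguf-py package is importable

writer = GGUFWriter("mmproj-example.gguf", "clip")  # hypothetical path and arch
writer.add_vision_projector_type("gemma3")          # one of the VisionProjectorType values
writer.add_vision_image_size(896)
writer.add_vision_patch_size(14)
writer.add_vision_embedding_length(1152)
writer.add_vision_block_count(27)
writer.add_vision_head_count(16)
writer.add_vision_attention_layernorm_eps(1e-6)
writer.add_vision_image_mean([0.5, 0.5, 0.5])
writer.add_vision_image_std([0.5, 0.5, 0.5])

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()
```
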
@@ -201,6 +201,27 @@ def wrapped_fn(*args, **kwargs):
                 return cls(
                     meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn
                 )
+            elif isinstance(res, tuple) and all(
+                isinstance(t, cls._tensor_type) for t in res
+            ):
+                # share the evaluation between lazy tuple elements
+                shared_args: list = [args, None]
+
+                def eager_tuple_element(a: list[Any], i: int = 0, /, **kw) -> LazyBase:
+                    assert len(a) == 2
+                    if a[1] is None:
+                        a[1] = fn(*a[0], **kw)
+                    return a[1][i]
+
+                return tuple(
+                    cls(
+                        meta=cls.eager_to_meta(res[i]),
+                        args=(shared_args, i),
+                        kwargs=kwargs,
+                        func=eager_tuple_element,
+                    )
+                    for i in range(len(res))
+                )
             else:
                 del res  # not needed
                 # non-tensor return likely relies on the contents of the args
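
This lazy-evaluation change lets a wrapped function that returns a tuple of tensors yield a tuple of lazy elements that all share one deferred call: the first element to be forced runs the real function, and the rest index into the cached result. A minimal standalone sketch of that sharing trick, using plain closures instead of `LazyBase`:

```python
# Sketch of the pattern eager_tuple_element implements above: evaluate once,
# index per element. Names here are hypothetical; only the idea is from the diff.
from typing import Any, Callable


def make_shared_tuple(fn: Callable[..., tuple], args: tuple, n: int) -> tuple:
    shared: list[Any] = [args, None]  # [call args, cached result]

    def element(a: list[Any], i: int) -> Any:
        if a[1] is None:          # first forced element triggers the real call
            a[1] = fn(*a[0])
        return a[1][i]            # later elements reuse the cached tuple

    return tuple((lambda i=i: element(shared, i)) for i in range(n))


calls = 0


def split(x: list[int]) -> tuple[list[int], list[int]]:
    global calls
    calls += 1
    mid = len(x) // 2
    return x[:mid], x[mid:]


lo, hi = make_shared_tuple(split, ([1, 2, 3, 4],), 2)
print(lo(), hi(), calls)  # -> [1, 2] [3, 4] 1  (split ran only once)
```
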
@@ -13,7 +13,7 @@ class TensorNameMap:
             "transformer.wte",  # gpt2 gpt-j mpt refact qwen dbrx jais exaone
             "transformer.word_embeddings",  # falcon
             "word_embeddings",  # bloom
-            "model.embed_tokens",  # llama-hf nemotron olmoe olmo2 rwkv6qwen2
+            "model.embed_tokens",  # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414
             "tok_embeddings",  # llama-pth
             "embeddings.word_embeddings",  # bert nomic-bert
             "language_model.embedding.word_embeddings",  # persimmon
@@ -29,6 +29,8 @@ class TensorNameMap:
             "shared",  # t5
             "rwkv.embeddings",  # rwkv6
             "model.embeddings",  # rwkv7
+            "model.word_embeddings",  # bailingmoe
+            "language_model.model.embed_tokens",  # llama4
         ),
         # Token type embeddings
         MODEL_TENSOR.TOKEN_TYPES: (
@@ -62,6 +64,7 @@ class TensorNameMap:
             "output_layer",  # chatglm
             "head",  # rwkv
             "head.out",  # wavtokenizer
+            "lm_head",  # llama4
         ),
         # Output norm
         MODEL_TENSOR.OUTPUT_NORM: (
@@ -83,6 +86,7 @@ class TensorNameMap:
             "rwkv.ln_out",  # rwkv6
             "model.ln_out",  # rwkv7
             "backbone.final_layer_norm",  # wavtokenizer
+            "model.norm",  # llama4
         ),
         # Rope frequencies
         MODEL_TENSOR.ROPE_FREQS: (
@@ -119,6 +123,7 @@ class TensorNameMap:
             "transformer.layers.{bid}.attn_norm",  # openelm
             "rwkv.blocks.{bid}.ln1",  # rwkv6
             "model.layers.{bid}.ln1",  # rwkv7
+            "model.layers.{bid}.input_layernorm",  # llama4
         ),
         # Attention norm 2
         MODEL_TENSOR.ATTN_NORM_2: (
@@ -155,6 +160,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wq",  # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.query",  # Grok
             "transformer.h.{bid}.attn.attention.q_proj",  # exaone
+            "model.layers.{bid}.self_attn.q_proj",  # llama4
         ),
         # Attention key
         MODEL_TENSOR.ATTN_K: (
@@ -168,6 +174,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wk",  # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.key",  # Grok
             "transformer.h.{bid}.attn.attention.k_proj",  # exaone
+            "model.layers.{bid}.self_attn.k_proj",  # llama4
         ),
         # Attention value
         MODEL_TENSOR.ATTN_V: (
@@ -180,6 +187,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wv",  # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.value",  # Grok
             "transformer.h.{bid}.attn.attention.v_proj",  # exaone
+            "model.layers.{bid}.self_attn.v_proj",  # llama4
         ),
         # Attention output
         MODEL_TENSOR.ATTN_OUT: (
@@ -205,6 +213,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.self_attention.dense",  # chatglm
             "transformer.layers.{bid}.attn.out_proj",  # openelm
             "transformer.h.{bid}.attn.attention.out_proj",  # exaone
+            "model.layers.{bid}.self_attn.o_proj",  # llama4
         ),
         # Attention output norm
         MODEL_TENSOR.ATTN_OUT_NORM: (
@@ -214,7 +223,8 @@ class TensorNameMap:
             "transformer.blocks.{bid}.norm_attn_norm.norm_2",  # dbrx
         ),
         MODEL_TENSOR.ATTN_POST_NORM: (
-            "model.layers.{bid}.post_attention_layernorm",  # gemma2 olmo2
+            "model.layers.{bid}.post_attention_layernorm",  # gemma2 olmo2 # ge
+            "model.layers.{bid}.post_self_attn_layernorm",  # glm-4-0414
         ),
         # Rotary embeddings
         MODEL_TENSOR.ATTN_ROT_EMBD: (
@@ -238,6 +248,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.rms_norm_2",  # Grok
             "encoder.layers.{bid}.post_attention_layernorm",  # chatglm
             "transformer.layers.{bid}.ffn_norm",  # openelm
+            "model.layers.{bid}.post_attention_layernorm",  # llama4
         ),
         # Post feed-forward norm
         MODEL_TENSOR.FFN_PRE_NORM: (
@@ -246,6 +257,7 @@ class TensorNameMap:
         # Post feed-forward norm
         MODEL_TENSOR.FFN_POST_NORM: (
             "model.layers.{bid}.post_feedforward_layernorm",  # gemma2 olmo2
+            "model.layers.{bid}.post_mlp_layernorm",  # glm-4-0414
         ),
         MODEL_TENSOR.FFN_GATE_INP: (
             "layers.{bid}.feed_forward.gate",  # mixtral
@@ -254,6 +266,8 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.router",  # Grok
             "transformer.blocks.{bid}.ffn.router.layer",  # dbrx
             "model.layers.{bid}.block_sparse_moe.router.layer",  # granitemoe
+            "model.layers.{bid}.feed_forward.router",  # llama4
+            "encoder.layers.{bid}.mlp.router.layer",  # nomic-bert-moe
         ),
         MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert_gate",  # qwen2moe
@@ -279,15 +293,17 @@ class TensorNameMap:
             "h.{bid}.mlp.c_fc",  # gpt2
             "transformer.h.{bid}.mlp.fc1",  # phi2
             "model.layers.{bid}.mlp.fc1",  # phi2
-            "model.layers.{bid}.mlp.gate_up_proj",  # phi3
+            "model.layers.{bid}.mlp.gate_up_proj",  # phi3 glm-4-0414
             "model.layers.layers.{bid}.mlp.up_proj",  # plamo
             "model.layers.{bid}.feed_forward.w3",  # internlm2
             "encoder.layers.{bid}.mlp.fc11",  # nomic-bert
+            "encoder.layers.{bid}.mlp.fc1",  # nomic-bert-moe
             "model.layers.{bid}.mlp.c_fc",  # starcoder2
             "encoder.layer.{bid}.mlp.gated_layers_v",  # jina-bert-v2
             "model.layers.{bid}.residual_mlp.w3",  # arctic
             "encoder.layers.{bid}.mlp.dense_h_to_4h",  # chatglm
             "transformer.h.{bid}.mlp.c_fc_1",  # exaone
+            "model.layers.{bid}.feed_forward.up_proj",  # llama4
         ),
         MODEL_TENSOR.FFN_UP_EXP: (
             "layers.{bid}.feed_forward.experts.w3",  # mixtral (merged)
@@ -295,10 +311,13 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.experts.mlp.v1",  # dbrx
             "model.layers.{bid}.mlp.experts.up_proj",  # qwen2moe olmoe (merged)
             "model.layers.{bid}.block_sparse_moe.experts.w3",  # phimoe (merged)
+            "model.layers.{bid}.feed_forward.experts.up_proj",  # llama4
+            "encoder.layers.{bid}.mlp.experts.mlp.w1",  # nomic-bert-moe
         ),
         MODEL_TENSOR.FFN_UP_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert.up_proj",  # qwen2moe
             "model.layers.{bid}.mlp.shared_experts.up_proj",  # deepseek deepseek2
+            "model.layers.{bid}.feed_forward.shared_expert.up_proj",  # llama4
         ),
         # AWQ-activation gate
         MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",),  # mpt
@@ -315,6 +334,7 @@ class TensorNameMap:
             "transformer.h.{bid}.mlp.linear_1",  # refact
             "model.layers.{bid}.residual_mlp.w1",  # arctic
             "transformer.h.{bid}.mlp.c_fc_0",  # exaone
+            "model.layers.{bid}.feed_forward.gate_proj",  # llama4
         ),
         MODEL_TENSOR.FFN_GATE_EXP: (
             "layers.{bid}.feed_forward.experts.w1",  # mixtral (merged)
@@ -322,10 +342,12 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.experts.mlp.w1",  # dbrx
             "model.layers.{bid}.mlp.experts.gate_proj",  # qwen2moe olmoe (merged)
             "model.layers.{bid}.block_sparse_moe.experts.w1",  # phimoe (merged)
+            "model.layers.{bid}.feed_forward.experts.gate_proj",  # llama4
         ),
         MODEL_TENSOR.FFN_GATE_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert.gate_proj",  # qwen2moe
             "model.layers.{bid}.mlp.shared_experts.gate_proj",  # deepseek deepseek2
+            "model.layers.{bid}.feed_forward.shared_expert.gate_proj",  # llama4
         ),
         # Feed-forward down
         MODEL_TENSOR.FFN_DOWN: (
@@ -353,6 +375,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.mlp.down_layer",  # jina-bert-v2
             "encoder.layers.{bid}.mlp.dense_4h_to_h",  # chatglm
             "model.layers.h.{bid}.mlp.c_proj",  # exaone
+            "model.layers.{bid}.feed_forward.down_proj",  # llama4
         ),
         MODEL_TENSOR.FFN_DOWN_EXP: (
             "layers.{bid}.feed_forward.experts.w2",  # mixtral (merged)
@@ -361,10 +384,14 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.experts.down_proj",  # qwen2moe olmoe (merged)
             "model.layers.{bid}.block_sparse_moe.output_linear",  # granitemoe
             "model.layers.{bid}.block_sparse_moe.experts.w2",  # phimoe (merged)
+            "model.layers.{bid}.feed_forward.experts.down_proj",  # llama4
+            "encoder.layers.{bid}.mlp.experts.mlp.w2",  # nomic-bert-moe
         ),
         MODEL_TENSOR.FFN_DOWN_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert.down_proj",  # qwen2moe
             "model.layers.{bid}.mlp.shared_experts.down_proj",  # deepseek deepseek2
+            "model.layers.{bid}.feed_forward.shared_expert.down_proj",  # llama4
+            "model.layers.{bid}.shared_mlp.output_linear",  # granitemoe
         ),
         MODEL_TENSOR.ATTN_Q_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
@@ -539,6 +566,8 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_KV_B: (
             "model.layers.{bid}.self_attn.kv_b_proj",  # deepseek2
         ),
+        MODEL_TENSOR.ATTN_K_B: ("model.layers.{bid}.self_attn.k_b_proj",),  # deepseek2
+        MODEL_TENSOR.ATTN_V_B: ("model.layers.{bid}.self_attn.v_b_proj",),  # deepseek2
         MODEL_TENSOR.ATTN_Q_A_NORM: (
             "model.layers.{bid}.self_attn.q_a_layernorm",  # deepseek2
         ),
@@ -636,6 +665,147 @@ class TensorNameMap:
         MODEL_TENSOR.POSNET_ATTN_OUT: (
             "backbone.posnet.{bid}.proj_out",  # wavtokenizer
         ),
+        #############################################################################
+        ## Vision encoder
+        MODEL_TENSOR.V_MMPROJ: (
+            "multi_modal_projector.linear_{bid}",
+            "visual.merger.mlp.{bid}",  # qwen2vl
+        ),
+        MODEL_TENSOR.V_MMPROJ_FC: (
+            "model.connector.modality_projection.proj",  # SmolVLM
+        ),
+        MODEL_TENSOR.V_MMPROJ_MLP: (
+            "model.mm_projector.mlp.mlp.{bid}",
+            "mlp1.{bid}",  # InternVL
+        ),
+        MODEL_TENSOR.V_MMPROJ_PEG: ("model.mm_projector.peg.peg.{bid}",),
+        MODEL_TENSOR.V_ENC_EMBD_CLS: (
+            "vision_tower.vision_model.embeddings.class_embedding",
+        ),
+        MODEL_TENSOR.V_ENC_EMBD_PATCH: (
+            "vision_tower.vision_model.embeddings.patch_embedding",
+            "vpm.embeddings.patch_embedding",
+            "model.vision_model.embeddings.patch_embedding",  # SmolVLM
+            "vision_tower.patch_conv",  # pixtral
+            "visual.patch_embed.proj",  # qwen2vl
+        ),
+        MODEL_TENSOR.V_ENC_EMBD_POS: (
+            "vision_tower.vision_model.embeddings.position_embedding",
+            "vpm.embeddings.position_embedding",
+            "model.vision_model.embeddings.position_embedding",  # SmolVLM
+        ),
+        MODEL_TENSOR.V_ENC_ATTN_Q: (
+            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
+            "vpm.encoder.layers.{bid}.self_attn.q_proj",
+            "model.vision_model.encoder.layers.{bid}.self_attn.q_proj",  # SmolVLM
+            "vision_tower.transformer.layers.{bid}.attention.q_proj",  # pixtral
+            "visual.blocks.{bid}.attn.q",  # qwen2vl, generated
+        ),
+        MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
+            "vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm",  # InternVL
+        ),
+        MODEL_TENSOR.V_ENC_ATTN_K: (
+            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
+            "vpm.encoder.layers.{bid}.self_attn.k_proj",
+            "model.vision_model.encoder.layers.{bid}.self_attn.k_proj",  # SmolVLM
+            "vision_tower.transformer.layers.{bid}.attention.k_proj",  # pixtral
+            "visual.blocks.{bid}.attn.k",  # qwen2vl, generated
+        ),
+        MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
+            "vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm",  # InternVL
+        ),
+        MODEL_TENSOR.V_ENC_ATTN_V: (
+            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
+            "vpm.encoder.layers.{bid}.self_attn.v_proj",
+            "model.vision_model.encoder.layers.{bid}.self_attn.v_proj",  # SmolVLM
+            "vision_tower.transformer.layers.{bid}.attention.v_proj",  # pixtral
+            "visual.blocks.{bid}.attn.v",  # qwen2vl, generated
+        ),
+        MODEL_TENSOR.V_ENC_INPUT_NORM: (
+            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
+            "vision_tower.vision_model.encoder.layers.{bid}.norm1",  # InternVL
+            "vpm.encoder.layers.{bid}.layer_norm1",
+            "model.vision_model.encoder.layers.{bid}.layer_norm1",  # SmolVLM
+            "vision_tower.transformer.layers.{bid}.attention_norm",  # pixtral
+            "visual.blocks.{bid}.norm1",  # qwen2vl
+        ),
+        MODEL_TENSOR.V_ENC_OUTPUT: (
+            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
+            "vision_tower.vision_model.encoder.layers.{bid}.attn.proj",  # InternVL
+            "vpm.encoder.layers.{bid}.self_attn.out_proj",
+            "model.vision_model.encoder.layers.{bid}.self_attn.out_proj",  # SmolVLM
+            "vision_tower.transformer.layers.{bid}.attention.o_proj",  # pixtral
+            "visual.blocks.{bid}.attn.proj",  # qwen2vl
+        ),
+        MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
+            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
+            "vision_tower.vision_model.encoder.layers.{bid}.norm2",  # InternVL
+            "vpm.encoder.layers.{bid}.layer_norm2",
+            "model.vision_model.encoder.layers.{bid}.layer_norm2",  # SmolVLM
+            "vision_tower.transformer.layers.{bid}.ffn_norm",  # pixtral
+            "visual.blocks.{bid}.norm2",  # qwen2vl
+        ),
+        MODEL_TENSOR.V_ENC_FFN_UP: (
+            "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
+            "vpm.encoder.layers.{bid}.mlp.fc1",
+            "model.vision_model.encoder.layers.{bid}.mlp.fc1",  # SmolVLM, gemma3
+            "vision_tower.transformer.layers.{bid}.feed_forward.up_proj",  # pixtral
+            "visual.blocks.{bid}.mlp.fc1",  # qwen2vl
+            "visual.blocks.{bid}.mlp.up_proj",  # qwen2.5vl
+        ),
+        MODEL_TENSOR.V_ENC_FFN_GATE: (
+            "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj",  # pixtral
+            "visual.blocks.{bid}.mlp.gate_proj",  # qwen2.5vl
+        ),
+        MODEL_TENSOR.V_ENC_FFN_DOWN: (
+            "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
+            "vpm.encoder.layers.{bid}.mlp.fc2",
+            "model.vision_model.encoder.layers.{bid}.mlp.fc2",  # SmolVLM, gemma3
+            "vision_tower.transformer.layers.{bid}.feed_forward.down_proj",  # pixtral
+            "visual.blocks.{bid}.mlp.fc2",  # qwen2vl
+            "visual.blocks.{bid}.mlp.down_proj",  # qwen2.5vl
+        ),
+        MODEL_TENSOR.V_LAYER_SCALE_1: (
+            "vision_tower.vision_model.encoder.layers.{bid}.ls1",  # InternVL
+        ),
+        MODEL_TENSOR.V_LAYER_SCALE_2: (
+            "vision_tower.vision_model.encoder.layers.{bid}.ls2",  # InternVL
+        ),
+        MODEL_TENSOR.V_PRE_NORM: (
+            "vision_tower.vision_model.pre_layrnorm",
+            "vision_tower.ln_pre",  # pixtral
+        ),
+        MODEL_TENSOR.V_POST_NORM: (
+            "vision_tower.vision_model.post_layernorm",
+            "model.vision_model.post_layernorm",  # SmolVLM
+            "visual.merger.ln_q",  # qwen2vl
+        ),
+        MODEL_TENSOR.V_MM_INP_PROJ: ("multi_modal_projector.mm_input_projection",),
+        MODEL_TENSOR.V_MM_INP_NORM: ("multi_modal_projector.norm",),
+        MODEL_TENSOR.V_MM_SOFT_EMB_NORM: ("multi_modal_projector.mm_soft_emb_norm",),
+        MODEL_TENSOR.V_RESMPL_POS_EMBD_K: ("resampler.pos_embed_k",),
+        MODEL_TENSOR.V_RESMPL_ATTN_Q: (
+            "resampler.attn.in_proj_q",  # tensor generated from resampler.attn.in_proj
+        ),
+        MODEL_TENSOR.V_RESMPL_ATTN_K: (
+            "resampler.attn.in_proj_k",  # tensor generated from resampler.attn.in_proj
+        ),
+        MODEL_TENSOR.V_RESMPL_ATTN_V: (
+            "resampler.attn.in_proj_v",  # tensor generated from resampler.attn.in_proj
+        ),
+        MODEL_TENSOR.V_RESMPL_ATTN_OUT: ("resampler.attn.out_proj",),
+        MODEL_TENSOR.V_RESMPL_KV: ("resampler.kv_proj",),
+        MODEL_TENSOR.V_RESMPL_POST_NORM: ("resampler.ln_post",),
+        MODEL_TENSOR.V_RESMPL_KV_NORM: ("resampler.ln_kv",),
+        MODEL_TENSOR.V_RESMPL_Q_NORM: ("resampler.ln_q",),
+        MODEL_TENSOR.V_RESMPL_PROJ: ("resampler.proj",),
+        MODEL_TENSOR.V_RESMPL_QUERY: ("resampler.query",),
+        MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: (
+            "v.token_embd.img_break",  # for pixtral, this is a generated vector
+        ),
+        MODEL_TENSOR.V_MM_PATCH_MERGER: (
+            "multi_modal_projector.patch_merger.merging_layer",  # mistral small 3.1
+        ),
     }
 
     # architecture-specific block mappings
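
These mapping hunks register the new llama4, glm-4-0414, bailingmoe, nomic-bert-moe, and vision-encoder source names. A hedged sketch of how a checkpoint name is resolved through `TensorNameMap`; the architecture, block count, and example tensor names below are illustrative, and the expected outputs assume the standard gguf-py lookup behavior rather than anything specific to this commit:

```python
# Illustrative only: resolve Hugging Face tensor names to GGUF names via the
# tensor name map. The newly added entries are looked up the same way once
# their architecture is selected.
import gguf  # assumes the vendored gguf-py package is importable

name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, 32)  # hypothetical 32-block model

print(name_map.get_name("model.embed_tokens.weight", try_suffixes=(".weight", ".bias")))
# expected: "token_embd.weight"

print(name_map.get_type_and_name("model.layers.0.self_attn.q_proj.weight",
                                 try_suffixes=(".weight", ".bias")))
# expected: (MODEL_TENSOR.ATTN_Q, "blk.0.attn_q.weight")
```
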
@@ -1,7 +1,11 @@
 from __future__ import annotations
 
+from dataclasses import dataclass
 from typing import Literal
 
+import os
+import json
+
 
 def fill_templated_filename(filename: str, output_type: str | None) -> str:
     # Given a file name fill in any type templates e.g. 'some-model-name.{ftype}.gguf'
@@ -99,3 +103,214 @@ def naming_convention(
     kind = f"-{model_type.strip().replace(' ', '-')}" if model_type is not None else ""
 
     return f"{name}{parameters}{finetune}{version}{encoding}{kind}"
+
+
+@dataclass
+class RemoteTensor:
+    dtype: str
+    shape: tuple[int, ...]
+    offset_start: int
+    size: int
+    url: str
+
+    def data(self) -> bytearray:
+        # TODO: handle request errors (maybe with limited retries?)
+        # NOTE: using a bytearray, otherwise PyTorch complains the buffer is not writeable
+        data = bytearray(
+            SafetensorRemote.get_data_by_range(
+                url=self.url, start=self.offset_start, size=self.size
+            )
+        )
+        return data
+
+
+class SafetensorRemote:
+    """
+    Utility class to handle remote safetensor files.
+    This class is designed to work with Hugging Face model repositories.
+
+    Example (one model has a single safetensor file, the other has multiple):
+        for model_id in ["ngxson/TEST-Tiny-Llama4", "Qwen/Qwen2.5-7B-Instruct"]:
+            tensors = SafetensorRemote.get_list_tensors_hf_model(model_id)
+            print(tensors)
+
+    Example reading tensor data:
+        tensors = SafetensorRemote.get_list_tensors_hf_model(model_id)
+        for name, meta in tensors.items():
+            dtype, shape, offset_start, size, remote_safetensor_url = meta
+            # read the tensor data
+            data = SafetensorRemote.get_data_by_range(remote_safetensor_url, offset_start, size)
+            print(data)
+    """
+
+    BASE_DOMAIN = "https://huggingface.co"
+    ALIGNMENT = 8  # bytes
+
+    @classmethod
+    def get_list_tensors_hf_model(cls, model_id: str) -> dict[str, RemoteTensor]:
+        """
+        Get list of tensors from a Hugging Face model repository.
+
+        Returns a dictionary of tensor names and their metadata.
+        Each tensor is represented as a tuple of (dtype, shape, offset_start, size, remote_safetensor_url)
+        """
+        # case 1: model has only one single model.safetensor file
+        is_single_file = cls.check_file_exist(
+            f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors"
+        )
+        if is_single_file:
+            url = f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors"
+            return cls.get_list_tensors(url)
+
+        # case 2: model has multiple files
+        index_url = (
+            f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors.index.json"
+        )
+        is_multiple_files = cls.check_file_exist(index_url)
+        if is_multiple_files:
+            # read the index file
+            index_data = cls.get_data_by_range(index_url, 0)
+            index_str = index_data.decode("utf-8")
+            index_json = json.loads(index_str)
+            assert (
+                index_json.get("weight_map") is not None
+            ), "weight_map not found in index file"
+            weight_map = index_json["weight_map"]
+            # get the list of files
+            all_files = list(set(weight_map.values()))
+            all_files.sort()  # make sure we load shard files in order
+            # get the list of tensors
+            tensors: dict[str, RemoteTensor] = {}
+            for file in all_files:
+                url = f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/{file}"
+                for key, val in cls.get_list_tensors(url).items():
+                    tensors[key] = val
+            return tensors
+
+        raise ValueError(f"Model {model_id} does not have any safetensor files")
+
+    @classmethod
+    def get_list_tensors(cls, url: str) -> dict[str, RemoteTensor]:
+        """
+        Get list of tensors from a remote safetensor file.
+
+        Returns a dictionary of tensor names and their metadata.
+        Each tensor is represented as a tuple of (dtype, shape, offset_start, size)
+        """
+        metadata, data_start_offset = cls.get_metadata(url)
+        res: dict[str, RemoteTensor] = {}
+
+        for name, meta in metadata.items():
+            if name == "__metadata__":
+                continue
+            if not isinstance(meta, dict):
+                raise ValueError(f"Invalid metadata for tensor '{name}': {meta}")
+            try:
+                dtype = meta["dtype"]
+                shape = meta["shape"]
+                offset_start_relative, offset_end_relative = meta["data_offsets"]
+                size = offset_end_relative - offset_start_relative
+                offset_start = data_start_offset + offset_start_relative
+                res[name] = RemoteTensor(
+                    dtype=dtype,
+                    shape=tuple(shape),
+                    offset_start=offset_start,
+                    size=size,
+                    url=url,
+                )
+            except KeyError as e:
+                raise ValueError(
+                    f"Missing key in metadata for tensor '{name}': {e}, meta = {meta}"
+                )
+
+        return res
+
+    @classmethod
+    def get_metadata(cls, url: str) -> tuple[dict, int]:
+        """
+        Get JSON metadata from a remote safetensor file.
+
+        Returns tuple of (metadata, data_start_offset)
+        """
+        # Request first 5MB of the file (hopefully enough for metadata)
+        read_size = 5 * 1024 * 1024
+        raw_data = cls.get_data_by_range(url, 0, read_size)
+
+        # Parse header
+        # First 8 bytes contain the metadata length as u64 little-endian
+        if len(raw_data) < 8:
+            raise ValueError("Not enough data to read metadata size")
+        metadata_length = int.from_bytes(raw_data[:8], byteorder="little")
+
+        # Calculate the data start offset
+        data_start_offset = 8 + metadata_length
+        alignment = SafetensorRemote.ALIGNMENT
+        if data_start_offset % alignment != 0:
+            data_start_offset += alignment - (data_start_offset % alignment)
+
+        # Check if we have enough data to read the metadata
+        if len(raw_data) < 8 + metadata_length:
+            raise ValueError(
+                f"Could not read complete metadata. Need {8 + metadata_length} bytes, got {len(raw_data)}"
+            )
+
+        # Extract metadata bytes and parse as JSON
+        metadata_bytes = raw_data[8 : 8 + metadata_length]
+        metadata_str = metadata_bytes.decode("utf-8")
+        try:
+            metadata = json.loads(metadata_str)
+            return metadata, data_start_offset
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Failed to parse safetensor metadata as JSON: {e}")
+
+    @classmethod
+    def get_data_by_range(cls, url: str, start: int, size: int = -1) -> bytes:
+        """
+        Get raw byte data from a remote file by range.
+        If size is not specified, it will read the entire file.
+        """
+        import requests
+        from urllib.parse import urlparse
+
+        parsed_url = urlparse(url)
+        if not parsed_url.scheme or not parsed_url.netloc:
+            raise ValueError(f"Invalid URL: {url}")
+
+        headers = cls._get_request_headers()
+        if size > -1:
+            headers["Range"] = f"bytes={start}-{start + size}"
+        response = requests.get(url, allow_redirects=True, headers=headers)
+        response.raise_for_status()
+
+        # Get raw byte data
+        return response.content[:size]
+
+    @classmethod
+    def check_file_exist(cls, url: str) -> bool:
+        """
+        Check if a file exists at the given URL.
+        Returns True if the file exists, False otherwise.
+        """
+        import requests
+        from urllib.parse import urlparse
+
+        parsed_url = urlparse(url)
+        if not parsed_url.scheme or not parsed_url.netloc:
+            raise ValueError(f"Invalid URL: {url}")
+
+        try:
+            headers = cls._get_request_headers()
+            headers["Range"] = "bytes=0-0"
+            response = requests.head(url, allow_redirects=True, headers=headers)
+            # Success (2xx) or redirect (3xx)
+            return 200 <= response.status_code < 400
+        except requests.RequestException:
+            return False
+
+    @classmethod
+    def _get_request_headers(cls) -> dict[str, str]:
+        """Prepare common headers for requests."""
+        headers = {"User-Agent": "convert_hf_to_gguf"}
+        if os.environ.get("HF_TOKEN"):
+            headers["Authorization"] = f"Bearer {os.environ['HF_TOKEN']}"
+        return headers
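
The new `SafetensorRemote` helper lists the tensors of a Hugging Face repository and fetches individual tensors with ranged HTTP requests, so safetensors models can be inspected or converted without downloading whole shards. A hedged usage sketch, assuming the module is importable as `gguf.utility`; the model id is only an example, and `HF_TOKEN` should be set for gated or private repositories:

```python
# Illustrative usage of the SafetensorRemote / RemoteTensor code added above.
from gguf.utility import SafetensorRemote  # assumed vendored module path

tensors = SafetensorRemote.get_list_tensors_hf_model("Qwen/Qwen2.5-7B-Instruct")
print(len(tensors), "tensors found")

name, remote = next(iter(tensors.items()))
print(name, remote.dtype, remote.shape, remote.size, "bytes")

raw = remote.data()  # ranged GET that downloads only this tensor's bytes
print(type(raw), len(raw))
```
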