mirror of https://github.com/leafspark/AutoGGUF
feat(core): add AutoFP8 quantization classes
- add AutoFP8 quantization classes
- minor fixes based on IDE recommendations
This commit is contained in:
parent
22bd74b399
commit
81b2d4137b
@@ -0,0 +1,560 @@
import copy
import gc
import re
from typing import List
from typing import Optional, Tuple

import torch
import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

from Logger import Logger

# https://github.com/neuralmagic/AutoFP8


class BaseQuantizeConfig:
    """Configuration for model quantization.

    Args:
        quant_method: Type/precision of quantization method to use.
            At the moment, this is just "fp8" which specifically means
            the fp8_e4m3 format in PyTorch.
        activation_scheme: Choice of either "dynamic" or "static" quantization
            of activations. If "static", then calibration samples are required
            during quantization to produce accurate per-tensor scales for
            activations of Linear modules.
        ignore_patterns: List of patterns used to ignore layers. If a string
            starts with "re:", then everything afterward is used as Python
            regex-style matching, i.e. re.search(), for each Linear layer.
            By default, "re:.*lm_head" is included to ignore the embedding
            Linear layer usually at the end of decoder LLMs.
        kv_cache_quant_targets: Tuple of Linear module names to target for
            calibration of the output scales for KV cache quantization.
            Usually, these should be `("k_proj", "v_proj")`.
    """

    def __init__(
        self,
        quant_method: str = "fp8",
        activation_scheme: str = "static",
        ignore_patterns: List[str] = ["re:.*lm_head"],
        kv_cache_quant_targets: Optional[Tuple[str]] = None,
    ):
        if quant_method != "fp8":
            raise ValueError("Only FP8 quantization is supported.")
        if activation_scheme not in ["static", "dynamic"]:
            raise ValueError(
                "Invalid activation_scheme. Choose either 'static' or 'dynamic'."
            )
        self.quant_method = quant_method
        self.activation_scheme = activation_scheme
        self.ignore_patterns = ignore_patterns
        self.kv_cache_quant_targets = kv_cache_quant_targets
        self.ignored_layers = []
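
For reference, a config that requests static activation scales plus KV cache scale calibration could be built as in the sketch below. This is an illustrative aside, not part of the committed file; the argument values are just examples.

# Illustrative usage sketch, not part of the diff.
example_config = BaseQuantizeConfig(
    quant_method="fp8",
    activation_scheme="static",
    ignore_patterns=["re:.*lm_head"],
    kv_cache_quant_targets=("k_proj", "v_proj"),
)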


# Class responsible for quantizing weights
class FP8DynamicLinear(torch.nn.Module):
    def __init__(
        self,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        bias: torch.nn.Parameter,
    ):
        super().__init__()
        self.weight = torch.nn.Parameter(weight, requires_grad=False)
        self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
        self.bias = bias

    def forward(self, x):
        qinput, x_scale = per_tensor_quantize(x)
        output = fp8_gemm(
            A=qinput,
            A_scale=x_scale,
            B=self.weight,
            B_scale=self.weight_scale,
            bias=self.bias,
            out_dtype=x.dtype,
        )
        return output


# Module responsible for taking already quantized weights, and recording
# input scales (and possibly output scales) using an activation observer
class FP8StaticLinearQuantizer(torch.nn.Module):
    def __init__(
        self,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        bias: torch.nn.Parameter,
        quantize_output: bool = False,
    ):
        super().__init__()
        self.weight = torch.nn.Parameter(weight, requires_grad=False)
        self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
        self.bias = bias
        self.input_scale = None
        self.output_scale = None
        self.quantize_output = quantize_output

    def forward(self, x):
        qinput, x_input_scale = per_tensor_quantize(x)
        if self.input_scale is None:
            self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False)
        elif x_input_scale > self.input_scale:
            self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False)
        output = fp8_gemm(
            A=qinput,
            A_scale=self.input_scale,
            B=self.weight,
            B_scale=self.weight_scale,
            bias=self.bias,
            out_dtype=x.dtype,
        )

        # Optionally, quantize output and record scale
        if self.quantize_output:
            qoutput, output_scale = per_tensor_quantize(output)
            if self.output_scale is None:
                self.output_scale = torch.nn.Parameter(
                    output_scale, requires_grad=False
                )
            elif output_scale > self.output_scale:
                self.output_scale = torch.nn.Parameter(
                    output_scale, requires_grad=False
                )
            output = qoutput.to(output.dtype) * output_scale

        return output


# Module responsible for representing the final checkpoint representation
class FP8StaticLinear(torch.nn.Module):
    def __init__(
        self,
        weight: torch.nn.Parameter,
        weight_scale: torch.nn.Parameter,
        bias: torch.nn.Parameter,
        input_scale: torch.nn.Parameter,
        output_scale: Optional[torch.nn.Parameter] = None,
    ):
        super().__init__()
        self.weight = weight
        self.weight_scale = weight_scale
        self.bias = bias
        self.input_scale = input_scale
        self.output_scale = output_scale

    def forward(self, x):
        qinput = static_per_tensor_quantize(x, self.input_scale)
        output = fp8_gemm(
            A=qinput,
            A_scale=self.input_scale,
            B=self.weight,
            B_scale=self.weight_scale,
            bias=self.bias,
            out_dtype=x.dtype,
        )

        if self.output_scale:
            qoutput = static_per_tensor_quantize(output, self.output_scale)
            output = qoutput.to(output.dtype) * self.output_scale

        return output


class AutoFP8ForCausalLM:
    def __init__(
        self,
        model: AutoModelForCausalLM,
        quantize_config: BaseQuantizeConfig,
    ):
        self.model = model
        self.model_type = self.model.config.model_type
        self.config = self.model.config

        # Gather the Linear module names that we want to ignore
        quantize_config.ignored_layers = get_layers_to_ignore(
            self.model, quantize_config.ignore_patterns
        )

        if quantize_config.kv_cache_quant_targets:
            kv_cache_quant_layers = get_kv_cache_quant_layers(
                self.model, quantize_config.kv_cache_quant_targets
            )
            if len(kv_cache_quant_layers) == 0:
                raise ValueError(
                    f"Could not find any kv cache layers using kv_cache_quant_targets={quantize_config.kv_cache_quant_targets}, please fix your argument."
                )
            quantize_config.kv_cache_quant_layers = kv_cache_quant_layers

        self.quantize_config = quantize_config

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        quantize_config: BaseQuantizeConfig,
        **model_init_kwargs,
    ):
        """Load the un-quantized pretrained model"""

        def skip(*args, **kwargs):
            pass

        torch.nn.init.kaiming_uniform_ = skip
        torch.nn.init.uniform_ = skip
        torch.nn.init.normal_ = skip

        # Parameters related to loading from Hugging Face Hub
        cache_dir = model_init_kwargs.pop("cache_dir", None)
        force_download = model_init_kwargs.pop("force_download", False)
        resume_download = model_init_kwargs.pop("resume_download", False)
        proxies = model_init_kwargs.pop("proxies", None)
        local_files_only = model_init_kwargs.pop("local_files_only", False)
        use_auth_token = model_init_kwargs.pop("use_auth_token", None)
        revision = model_init_kwargs.pop("revision", None)
        subfolder = model_init_kwargs.pop("subfolder", "")
        commit_hash = model_init_kwargs.pop("_commit_hash", None)

        cached_file_kwargs = {
            "cache_dir": cache_dir,
            "force_download": force_download,
            "proxies": proxies,
            "resume_download": resume_download,
            "local_files_only": local_files_only,
            "use_auth_token": use_auth_token,
            "revision": revision,
            "subfolder": subfolder,
            "_commit_hash": commit_hash,
        }

        torch.cuda.empty_cache()

        # Important defaults
        if "torch_dtype" not in model_init_kwargs:
            model_init_kwargs["torch_dtype"] = "auto"

        if "device_map" not in model_init_kwargs:
            model_init_kwargs["device_map"] = "auto"

        merged_kwargs = {**model_init_kwargs, **cached_file_kwargs}
        print("Loading model with the following kwargs:", merged_kwargs)
        model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path, **merged_kwargs
        )

        model_config = model.config.to_dict()
        seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions"]
        if any(k in model_config for k in seq_len_keys):
            for key in seq_len_keys:
                if key in model_config:
                    model.seqlen = model_config[key]
                    break
        else:
            print("Can't get model's sequence length, setting to 2048.")
            model.seqlen = 2048
        model.eval()

        return cls(model, quantize_config)

    def quantize(self, calibration_tokens: Optional[torch.Tensor] = None):

        # Always quantize the weights as they do not require calibration data
        quantize_weights(self.model, self.quantize_config)

        if self.quantize_config.activation_scheme == "static":
            assert (
                calibration_tokens is not None
            ), "Calibration tokens required for activation quantization"

            def _prepare_calibration_data(calibration_tokens):
                if hasattr(calibration_tokens, "input_ids"):
                    return calibration_tokens.input_ids
                return calibration_tokens

            quantize_activations(
                self.model,
                self.quantize_config,
                _prepare_calibration_data(calibration_tokens),
            )

    def save_quantized(self, save_dir, logger):
        save_quantized_model(
            self.model,
            quant_config=self.quantize_config,
            save_dir=save_dir,
            logger=logger,
        )


def cleanup_memory():
    gc.collect()
    torch.cuda.empty_cache()


def per_tensor_quantize(tensor: torch.Tensor) -> Tuple[torch.Tensor, float]:
    """Quantize a tensor using per-tensor static scaling factor.

    Args:
        tensor: The input tensor.
    """
    finfo = torch.finfo(torch.float8_e4m3fn)
    # Calculate the scale as dtype max divided by absmax.
    # Since .abs() creates a new tensor, we use aminmax to get
    # the min and max first and then calculate the absmax.
    if tensor.numel() == 0:
        # Deal with empty tensors (triggered by empty MoE experts)
        min_val, max_val = (
            torch.tensor(-16.0, dtype=tensor.dtype),
            torch.tensor(16.0, dtype=tensor.dtype),
        )
    else:
        min_val, max_val = tensor.aminmax()
    amax = torch.maximum(min_val.abs(), max_val.abs())
    scale = finfo.max / amax.clamp(min=1e-12)
    # Scale and clamp the tensor to bring it to
    # the representative range of the float8 data type
    # (as the default cast is unsaturated)
    qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max)
    # Return both the float8 data and the inverse scale (as float),
    # as both are required as inputs to torch._scaled_mm
    qweight = qweight.to(torch.float8_e4m3fn)
    scale = scale.float().reciprocal()
    return qweight, scale
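
Since the function returns the inverse (dequantization) scale, a quick round trip shows how a caller recovers an approximation of the original tensor. This is an illustrative aside, not part of the diff, and assumes a PyTorch build that provides torch.float8_e4m3fn.

# Round-trip sketch (illustrative, assumes torch.float8_e4m3fn is available).
x = torch.randn(4, 8)
q, inv_scale = per_tensor_quantize(x)
x_approx = q.to(x.dtype) * inv_scale  # dequantize with the returned inverse scale
print((x - x_approx).abs().max())  # small quantization error expected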


def static_per_tensor_quantize(tensor: torch.Tensor, inv_scale: float) -> torch.Tensor:
    finfo = torch.finfo(torch.float8_e4m3fn)
    qweight = (tensor / inv_scale).clamp(min=finfo.min, max=finfo.max)
    return qweight.to(torch.float8_e4m3fn)


def fp8_gemm(A, A_scale, B, B_scale, bias, out_dtype):
    if A.numel() == 0:
        # Deal with empty tensors (triggered by empty MoE experts)
        return torch.empty(size=(0, B.shape[0]), dtype=out_dtype, device=A.device)

    # TODO: Disable native fp8 gemm for now, always just dequantize
    # native_fp8_support = (
    #     torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 9)
    # )
    native_fp8_support = False
    if native_fp8_support:
        need_reshape = A.dim() == 3
        if need_reshape:
            batch_size = A.shape[0]
            A_input = A.reshape(-1, A.shape[-1])
        else:
            batch_size = None
            A_input = A
        output, _ = torch._scaled_mm(
            A_input,
            B.t(),
            out_dtype=out_dtype,
            scale_a=A_scale,
            scale_b=B_scale,
            bias=bias,
        )
        if need_reshape:
            output = output.reshape(
                batch_size, output.shape[0] // batch_size, output.shape[1]
            )
    else:
        output = torch.nn.functional.linear(
            A.to(out_dtype) * A_scale,
            B.to(out_dtype) * B_scale.to(out_dtype),
            bias=bias,
        )
    return output
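
Because the native torch._scaled_mm path is disabled here, every call takes the dequantize fallback. The sketch below (illustrative, not part of the diff, sizes arbitrary) checks that this path tracks a plain nn.Linear up to FP8 rounding error, again assuming torch.float8_e4m3fn is available.

# Fallback-path sanity check (illustrative only).
lin = torch.nn.Linear(64, 32)
w_q, w_scale = per_tensor_quantize(lin.weight.detach())
x = torch.randn(2, 64)
x_q, x_scale = per_tensor_quantize(x)
y_fp8 = fp8_gemm(
    A=x_q,
    A_scale=x_scale,
    B=w_q,
    B_scale=w_scale,
    bias=lin.bias,
    out_dtype=torch.float32,
)
print((y_fp8 - lin(x)).abs().max())  # difference comes only from fp8 rounding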


def replace_module(model: AutoModelForCausalLM, name: str, new_module: torch.nn.Module):
    if "." in name:
        parent_name = name.rsplit(".", 1)[0]
        child_name = name[len(parent_name) + 1 :]
        parent = model.get_submodule(parent_name)
    else:
        parent_name = ""
        parent = model
        child_name = name
    setattr(parent, child_name, new_module)


def quantize_weights(
    model: AutoModelForCausalLM,
    quantize_config: BaseQuantizeConfig,
):
    named_modules = list(model.named_modules())
    for name, linear in tqdm.tqdm(named_modules, desc="Quantizing weights"):
        if (
            not isinstance(linear, torch.nn.Linear)
            or name in quantize_config.ignored_layers
        ):
            continue
        quant_weight, weight_scale = per_tensor_quantize(linear.weight)
        bias = copy.deepcopy(linear.bias) if linear.bias is not None else None
        quant_linear = FP8DynamicLinear(
            weight=quant_weight, weight_scale=weight_scale, bias=bias
        )
        replace_module(model, name, quant_linear)
        del linear.weight
        del linear.bias
        del linear
    cleanup_memory()


def quantize_activations(
    model: AutoModelForCausalLM,
    quantize_config: BaseQuantizeConfig,
    calibration_tokens,
):
    # Replace weight quantizer with a dynamic activation quantizer observer
    for name, dynamic_quant_linear in model.named_modules():
        if (
            not isinstance(dynamic_quant_linear, FP8DynamicLinear)
            or name in quantize_config.ignored_layers
        ):
            continue
        quantizer = FP8StaticLinearQuantizer(
            weight=dynamic_quant_linear.weight,
            weight_scale=dynamic_quant_linear.weight_scale,
            bias=dynamic_quant_linear.bias,
            quantize_output=(
                hasattr(quantize_config, "kv_cache_quant_layers")
                and name in quantize_config.kv_cache_quant_layers
            ),
        )
        replace_module(model, name, quantizer)
        del dynamic_quant_linear
    cleanup_memory()

    # Pass through calibration data to measure activation scales
    with torch.inference_mode():
        with tqdm.tqdm(
            total=calibration_tokens.shape[0], desc="Calibrating activation scales"
        ) as pbar:
            for row_idx in range(calibration_tokens.shape[0]):
                model(calibration_tokens[row_idx].reshape(1, -1))
                cleanup_memory()
                pbar.update(1)

    # Replace dynamic quantizer observer with StaticLinear for export
    for name, quantizer in model.named_modules():
        if (
            not isinstance(quantizer, FP8StaticLinearQuantizer)
            or name in quantize_config.ignored_layers
        ):
            continue
        static_proj = FP8StaticLinear(
            weight=quantizer.weight,
            weight_scale=quantizer.weight_scale,
            bias=quantizer.bias,
            input_scale=quantizer.input_scale,
            output_scale=quantizer.output_scale,
        )
        replace_module(model, name, static_proj)
        del quantizer
    cleanup_memory()

    # Post-process step for kv cache scales to take the k/v module
    # `output_scale` parameters, and store them in the parent attention
    # module as `k_scale` and `v_scale`
    if hasattr(quantize_config, "kv_cache_quant_layers"):
        # Assumes that list is ordered such that
        # [layer0.k_proj, layer0.v_proj, layer1.k_proj, layer1.v_proj, ...]
        # so we make a list of tuples
        # [(layer0.k_proj, layer0.v_proj), (layer1.k_proj, layer1.v_proj), ...]
        kv_proj_pairs = zip(*[iter(quantize_config.kv_cache_quant_layers)] * 2)
        for k_proj_name, v_proj_name in kv_proj_pairs:
            parent_module_name = ".".join(k_proj_name.split(".")[:-1])
            assert parent_module_name == ".".join(v_proj_name.split(".")[:-1])
            parent_module = dict(model.named_modules())[parent_module_name]

            k_proj = dict(model.named_modules())[k_proj_name]
            v_proj = dict(model.named_modules())[v_proj_name]

            parent_module.k_scale = torch.nn.Parameter(
                k_proj.output_scale, requires_grad=False
            )
            parent_module.v_scale = torch.nn.Parameter(
                v_proj.output_scale, requires_grad=False
            )

            # Remove output_scale from k_proj and v_proj
            k_proj.output_scale = None
            v_proj.output_scale = None
    cleanup_memory()
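
The KV cache post-processing above leans on a small Python idiom: zipping the same iterator twice pairs consecutive entries, which is why the k_proj/v_proj ordering assumption in the comment matters. A standalone illustration (not part of the diff, names hypothetical):

# Pairing idiom used for kv_cache_quant_layers (illustrative names).
names = ["layers.0.k_proj", "layers.0.v_proj", "layers.1.k_proj", "layers.1.v_proj"]
print(list(zip(*[iter(names)] * 2)))
# [('layers.0.k_proj', 'layers.0.v_proj'), ('layers.1.k_proj', 'layers.1.v_proj')]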


def save_quantized_model(
    model: AutoModelForCausalLM,
    quant_config: BaseQuantizeConfig,
    save_dir: str,
    logger: Logger,
):
    logger.info(model)
    logger.info(f"Saving the model to {save_dir}")
    static_q_dict = {
        "quantization_config": {
            "quant_method": "fp8",
            "activation_scheme": quant_config.activation_scheme,
            "ignored_layers": quant_config.ignored_layers,
        }
    }
    if hasattr(quant_config, "kv_cache_quant_layers"):
        static_q_dict["quantization_config"]["kv_cache_scheme"] = "static"
    model.config.update(static_q_dict)
    model.save_pretrained(save_dir)
    tokenizer = AutoTokenizer.from_pretrained(model.config._name_or_path)
    tokenizer.save_pretrained(save_dir)


def get_layers_to_ignore(model, ignore_patterns) -> List[str]:
    ignored_layers = set()

    for name, linear in model.named_modules():
        if not isinstance(linear, torch.nn.Linear):
            continue

        for ignore_pattern in ignore_patterns:
            regex_prefix = "re:"
            if ignore_pattern.startswith(regex_prefix):
                # check if name matches regex and add to set if true
                regex_pattern = ignore_pattern[len(regex_prefix) :]
                if re.search(regex_pattern, name):
                    ignored_layers.add(name)
            else:
                # else, exact match
                if ignore_pattern == name:
                    ignored_layers.add(name)

    return list(ignored_layers)
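
To make the matching rules concrete, the toy module below (hypothetical, not part of the diff) shows that a "re:" pattern is applied with re.search() while any other pattern must match the module name exactly.

# Toy illustration of the two matching modes (hypothetical module).
class _Toy(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.lm_head = torch.nn.Linear(8, 8)
        self.q_proj = torch.nn.Linear(8, 8)

print(get_layers_to_ignore(_Toy(), ["re:.*lm_head"]))  # ['lm_head']
print(get_layers_to_ignore(_Toy(), ["q_proj"]))  # ['q_proj'] (exact match)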


def get_kv_cache_quant_layers(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
    kv_cache_quant_layers = []

    for name, linear in model.named_modules():
        if not isinstance(linear, torch.nn.Linear):
            continue

        for output_quant_target in kv_cache_quant_targets:
            if name.endswith(output_quant_target):
                kv_cache_quant_layers.append(name)

    return kv_cache_quant_layers


def quantize_to_fp8_dynamic(
    input_model_dir: str, output_model_dir: str, logger: Logger
) -> None:
    logger.info("Starting fp8 dynamic quantization")
    # Define quantization config with dynamic activation scales
    quantize_config = BaseQuantizeConfig(
        quant_method="fp8", activation_scheme="dynamic"
    )

    # Load the model, quantize, and save checkpoint
    model = AutoFP8ForCausalLM.from_pretrained(input_model_dir, quantize_config)
    # No calibration examples are needed for dynamic quantization
    model.quantize([])
    model.save_quantized(output_model_dir, logger)
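
For contrast, a static-scheme counterpart would have to supply calibration tokens, since quantize() asserts their presence when activation_scheme="static". The sketch below is hypothetical and not part of this commit; the prompt list is a placeholder and it assumes the tokenizer defines a padding token and that inputs end up on a device the dispatched model accepts.

def quantize_to_fp8_static_sketch(
    input_model_dir: str, output_model_dir: str, logger: Logger
) -> None:
    # Hypothetical static-scheme counterpart (illustrative only, not in the diff).
    tokenizer = AutoTokenizer.from_pretrained(input_model_dir)
    prompts = ["Hello, world!", "A short calibration sample for FP8 scales."]
    tokens = tokenizer(prompts, return_tensors="pt", padding=True).input_ids

    quantize_config = BaseQuantizeConfig(
        quant_method="fp8",
        activation_scheme="static",
        kv_cache_quant_targets=("k_proj", "v_proj"),
    )
    model = AutoFP8ForCausalLM.from_pretrained(input_model_dir, quantize_config)
    model.quantize(tokens)
    model.save_quantized(output_model_dir, logger)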

@@ -35,9 +35,8 @@ class AutoGGUF(QMainWindow):
     def __init__(self, args: List[str]) -> None:
         super().__init__()
+        self.logger = Logger("AutoGGUF", "logs")
 
         width, height = self.parse_resolution()
 
-        self.logger = Logger("AutoGGUF", "logs")
-
         self.logger.info(INITIALIZING_AUTOGGUF)
         self.setWindowTitle(WINDOW_TITLE)
@@ -107,7 +106,7 @@ def __init__(self, args: List[str]) -> None:
         self.update_gpu_offload_slider = partial(
             ui_update.update_gpu_offload_slider, self
         )
-        self.update_model_info = partial(ui_update.update_model_info, self.logger, self)
+        self.update_model_info = partial(ui_update.update_model_info, self.logger)
         self.update_system_info = partial(ui_update.update_system_info, self)
         self.update_download_progress = partial(
             ui_update.update_download_progress, self
@@ -792,6 +791,8 @@ def __init__(self, args: List[str]) -> None:
             default_theme = f.read()
             self.setStyleSheet(default_theme)
 
+        self.imported_models = []
+
         # Load models
         self.load_models()
 
@@ -1089,7 +1090,7 @@ def restart_task(self, task_item) -> None:
         self.quant_threads.append(new_thread)
         new_thread.status_signal.connect(task_item.update_status)
         new_thread.finished_signal.connect(
-            lambda: self.task_finished(new_thread)
+            lambda: self.task_finished(new_thread, task_item)
         )
         new_thread.error_signal.connect(
             lambda err: handle_error(self.logger, err, task_item)
@@ -1173,7 +1174,7 @@ def verify_gguf(self, file_path) -> bool:
             with open(file_path, "rb") as f:
                 magic = f.read(4)
                 return magic == b"GGUF"
-        except Exception:
+        except (FileNotFoundError, IOError, OSError):
             return False
 
     def load_models(self) -> None:
@@ -1454,7 +1455,7 @@ def quantize_model(self) -> None:
 
     def parse_progress(self, line, task_item) -> None:
         # Parses the output line for progress information and updates the task item.
-        match = re.search(r"\[\s*(\d+)\s*/\s*(\d+)\s*\].*", line)
+        match = re.search(r"\[\s*(\d+)\s*/\s*(\d+)\s*].*", line)
         if match:
             current = int(match.group(1))
             total = int(match.group(2))
@@ -1516,8 +1517,6 @@ def import_model(self) -> None:
                 QMessageBox.StandardButton.No,
             )
             if reply == QMessageBox.StandardButton.Yes:
-                if not hasattr(self, "imported_models"):
-                    self.imported_models = []
                 self.imported_models.append(file_path)
                 self.load_models()
                 self.logger.info(MODEL_IMPORTED_SUCCESSFULLY.format(file_name))
@@ -1596,7 +1595,9 @@ def generate_imatrix(self) -> None:
         self.task_list.setItemWidget(list_item, task_item)
 
         thread.status_signal.connect(task_item.update_status)
-        thread.finished_signal.connect(lambda: self.task_finished(thread))
+        thread.finished_signal.connect(
+            lambda: self.task_finished(thread, task_item)
+        )
         thread.error_signal.connect(
             lambda err: handle_error(self.logger, err, task_item)
         )
@@ -61,7 +61,6 @@ def get_override_string(
         "{system.hostname}": lambda: socket.gethostname(),
         "{system.platform}": lambda: platform.system(),
         "{system.python.version}": lambda: platform.python_version(),
-        "{system.time.milliseconds}": lambda: str(int(time.time() * 1000)),
         "{system.date}": lambda: datetime.now().strftime("%Y-%m-%d"),
         "{model.name}": lambda: (
             model_name if model_name is not None else "Unknown Model"
@@ -419,7 +419,7 @@ def prepare_tensors(self):
             shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
 
             logger.info(
-                f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}"
+                f"{f'%s-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}"
             )
 
             self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)
@@ -1132,7 +1132,6 @@ def set_vocab(self):
         try:
             self._set_vocab_gpt2()
         except Exception:
-
             self._set_vocab_sentencepiece()
             self.gguf_writer.add_add_bos_token(False)
             self.gguf_writer.add_pad_token_id(3)
@@ -4125,7 +4124,7 @@ def set_vocab(self):
                 if len(token) == 1:
                     continue
                 merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
-                assert len(merged) >= 2 and len(merged) <= 7
+                assert 2 <= len(merged) <= 7
                 merges.append(" ".join(map(ChatGLMModel.token_bytes_to_string, merged)))
 
             added_vocab = tokenizer.get_added_vocab()
@@ -12,7 +12,7 @@ def toggle_gpu_offload_auto(self, state) -> None:
     self.gpu_offload_spinbox.setEnabled(not is_auto)
 
 
-def update_model_info(logger, self, model_info) -> None:
+def update_model_info(logger, model_info) -> None:
     logger.debug(UPDATING_MODEL_INFO.format(model_info))
     pass
 