chore: update llama.cpp convert scripts

BuildTools 2024-08-31 14:01:41 -07:00
parent fb9addb8c0
commit a59b49fd97
3 changed files with 75 additions and 178 deletions

File 1 of 3: Python requirements (pinned package versions)

@@ -11,3 +11,4 @@ python-dotenv~=1.0.1
 safetensors~=0.4.4
 setuptools~=68.2.0
 huggingface-hub~=0.24.6
+transformers~=4.44.2

File 2 of 3: convert_hf_to_gguf.py (Hugging Face → GGUF model converter)

@@ -69,6 +69,7 @@ class Model:
 model_name: str | None
 metadata_override: Path | None
 dir_model_card: Path
+is_lora: bool
 model_arch: gguf.MODEL_ARCH
@@ -86,6 +87,7 @@ def __init__(
 split_max_size: int = 0,
 dry_run: bool = False,
 small_first_shard: bool = False,
+is_lora: bool = False,
 ):
 if type(self) is Model:
 raise TypeError(
@@ -118,6 +120,7 @@ def __init__(
 self.metadata_override = metadata_override
 self.model_name = model_name
 self.dir_model_card = dir_model
+self.is_lora = is_lora
 if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -381,6 +384,7 @@ def prepare_tensors(self):
 gguf.MODEL_TENSOR.FFN_GATE_INP,
 gguf.MODEL_TENSOR.POS_EMBD,
 gguf.MODEL_TENSOR.TOKEN_TYPES,
+gguf.MODEL_TENSOR.SSM_CONV1D,
 )
 )
 or not name.endswith(".weight")
@@ -1831,7 +1835,10 @@ def prepare_tensors(self):
 if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
 if rope_scaling.get("rope_type", "").lower() == "llama3":
 base = self.hparams.get("rope_theta", 10000.0)
-dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+dim = self.hparams.get(
+"head_dim",
+self.hparams["hidden_size"] // self.hparams["num_attention_heads"],
+)
 freqs = 1.0 / (
 base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)
 )
@@ -1860,6 +1867,7 @@ def prepare_tensors(self):
 )
 rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+if not self.is_lora:
 self.gguf_writer.add_tensor(
 self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS),
 np.array(rope_factors, dtype=np.float32),
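For context on the hunk above: some checkpoints carry an explicit head_dim hparam that differs from hidden_size // num_attention_heads, and the old expression then derived the wrong rotary dimension for the llama3-style rope factors. A minimal sketch of the fallback, with made-up hparams (the keys match the diff; the numbers are illustrative only):

    import torch

    # Hypothetical config where head_dim is explicit and differs from the derived value.
    hparams = {"hidden_size": 2304, "num_attention_heads": 8, "head_dim": 256}

    derived = hparams["hidden_size"] // hparams["num_attention_heads"]  # 288 (old behaviour)
    dim = hparams.get("head_dim", derived)                              # 256 (new behaviour)

    base = hparams.get("rope_theta", 10000.0)
    freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
    print(freqs.shape)  # torch.Size([128]): one inverse frequency per rotary pair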
@@ -2472,6 +2480,7 @@ def set_gguf_parameters(self):
 f"The length of rope long and short factors must be {rope_dims / 2}"
 )
+if not self.is_lora:
 self.gguf_writer.add_tensor(
 gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight",
 np.array(long_factors, dtype=np.float32),
@@ -3081,7 +3090,7 @@ class StarCoder2Model(Model):
 model_arch = gguf.MODEL_ARCH.STARCODER2
-@Model.register("MambaForCausalLM", "MambaLMHeadModel")
+@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(Model):
 model_arch = gguf.MODEL_ARCH.MAMBA
@@ -3117,6 +3126,10 @@ def set_gguf_parameters(self):
 self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True)
 or 1e-5
 )
+use_dt_b_c_norm = False
+if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",):
+use_dt_b_c_norm = True
 assert d_inner == 2 * d_model
@@ -3124,12 +3137,13 @@ def set_gguf_parameters(self):
 self.gguf_writer.add_embedding_length(d_model)
 self.gguf_writer.add_feed_forward_length(0)
 self.gguf_writer.add_head_count(0)
-self.gguf_writer.add_block_count(self.hparams["n_layer"])
+self.gguf_writer.add_block_count(self.block_count)
 self.gguf_writer.add_ssm_conv_kernel(d_conv)
 self.gguf_writer.add_ssm_inner_size(d_inner)
 self.gguf_writer.add_ssm_state_size(d_state)
 self.gguf_writer.add_ssm_time_step_rank(dt_rank)
 self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm)
 self.gguf_writer.add_file_type(self.ftype)
 _tok_embd = None
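A hedged sketch of what the three Mamba hunks above add: registration of the FalconMambaForCausalLM architecture, detection of the falcon_mamba model type, and a new ssm_dt_b_c_rms flag written into the GGUF metadata to signal that the dt, B and C SSM tensors are RMS-normed in that variant. The helper below only illustrates the detection step; the helper name and config path are assumptions.

    import json
    from pathlib import Path

    def uses_dt_b_c_rms(dir_model: Path) -> bool:
        # Mirrors the check in set_gguf_parameters: only the falcon_mamba
        # model type turns the flag on; plain Mamba checkpoints leave it off.
        config = json.loads((dir_model / "config.json").read_text())
        return config.get("model_type") in ("falcon_mamba",)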
@@ -3159,25 +3173,6 @@ def modify_tensors(
 return [(new_name, data_torch)]
-def tensor_force_quant(
-self, name: str, new_name: str, bid: int | None, n_dims: int
-) -> gguf.GGMLQuantizationType | bool:
-if bid is not None and new_name in (
-self.format_tensor_name(
-n, bid, ".weight" if name.endswith(".weight") else ""
-)
-for n in [
-gguf.MODEL_TENSOR.SSM_CONV1D,
-gguf.MODEL_TENSOR.SSM_X,
-gguf.MODEL_TENSOR.SSM_DT,
-gguf.MODEL_TENSOR.SSM_A,
-gguf.MODEL_TENSOR.SSM_D,
-]
-):
-return gguf.GGMLQuantizationType.F32
-return super().tensor_force_quant(name, new_name, bid, n_dims)
 @Model.register("CohereForCausalLM")
 class CommandR2Model(Model):
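For reference, the method removed in the hunk above pinned a fixed set of Mamba SSM tensors to F32 regardless of the requested --outtype (a related hunk earlier now adds SSM_CONV1D to the always-F32 list in the base class's prepare_tensors). A condensed sketch of the old behaviour, assuming the gguf-py package is installed:

    import gguf

    # Tensor roles the removed override used to pin to F32 during conversion.
    PINNED_TO_F32 = {
        gguf.MODEL_TENSOR.SSM_CONV1D,
        gguf.MODEL_TENSOR.SSM_X,
        gguf.MODEL_TENSOR.SSM_DT,
        gguf.MODEL_TENSOR.SSM_A,
        gguf.MODEL_TENSOR.SSM_D,
    }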
@@ -4301,7 +4296,10 @@ def prepare_tensors(self):
 if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
 if rope_scaling.get("rope_type", "").lower() == "llama3":
 base = self.hparams.get("rope_theta", 10000.0)
-dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+dim = self.hparams.get(
+"head_dim",
+self.hparams["hidden_size"] // self.hparams["num_attention_heads"],
+)
 freqs = 1.0 / (
 base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)
 )
@@ -4330,6 +4328,7 @@ def prepare_tensors(self):
 )
 rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+if not self.is_lora:
 self.gguf_writer.add_tensor(
 self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS),
 np.array(rope_factors, dtype=np.float32),
@@ -4403,82 +4402,26 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):
 def parse_args() -> argparse.Namespace:
-parser = argparse.ArgumentParser(description="")
-parser.add_argument(
-"--vocab-only",
-action="store_true",
-help="",
-)
-parser.add_argument(
-"--outfile",
-type=Path,
-help="",
-)
+parser = argparse.ArgumentParser()
+parser.add_argument("--vocab-only", action="store_true")
+parser.add_argument("--outfile", type=Path)
 parser.add_argument(
 "--outtype",
 type=str,
 choices=["f32", "f16", "bf16", "q8_0", "auto"],
 default="f16",
-help="",
-)
-parser.add_argument(
-"--bigendian",
-action="store_true",
-help="",
-)
-parser.add_argument(
-"model",
-type=Path,
-help="",
-)
-parser.add_argument(
-"--use-temp-file",
-action="store_true",
-help="",
-)
-parser.add_argument(
-"--no-lazy",
-action="store_true",
-help="",
-)
-parser.add_argument(
-"--model-name",
-type=str,
-default=None,
-help="",
-)
-parser.add_argument(
-"--verbose",
-action="store_true",
-help="",
-)
-parser.add_argument(
-"--split-max-tensors",
-type=int,
-default=0,
-help="",
-)
-parser.add_argument(
-"--split-max-size",
-type=str,
-default="0",
-help="",
-)
-parser.add_argument(
-"--dry-run",
-action="store_true",
-help="",
-)
-parser.add_argument(
-"--no-tensor-first-split",
-action="store_true",
-help="",
-)
-parser.add_argument(
-"--metadata",
-type=Path,
-help="",
 )
+parser.add_argument("--bigendian", action="store_true")
+parser.add_argument("model", type=Path)
+parser.add_argument("--use-temp-file", action="store_true")
+parser.add_argument("--no-lazy", action="store_true")
+parser.add_argument("--model-name", type=str, default=None)
+parser.add_argument("--verbose", action="store_true")
+parser.add_argument("--split-max-tensors", type=int, default=0)
+parser.add_argument("--split-max-size", type=str, default="0")
+parser.add_argument("--dry-run", action="store_true")
+parser.add_argument("--no-tensor-first-split", action="store_true")
+parser.add_argument("--metadata", type=Path)
 return parser.parse_args()
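The slimmed-down parser keeps the same flags, types and defaults; only the empty description= and help= stubs are dropped. A quick sketch of how the resulting CLI resolves, assuming the script can be imported as the convert_hf_to_gguf module (as the LoRA script below does) and using a made-up model path:

    import sys
    from pathlib import Path

    from convert_hf_to_gguf import parse_args  # assumption: importable without side effects

    sys.argv = ["convert_hf_to_gguf.py", "models/example-7b", "--outtype", "q8_0", "--dry-run"]
    args = parse_args()
    assert args.model == Path("models/example-7b")
    assert args.outtype == "q8_0" and args.dry_run and not args.vocab_only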

File 3 of 3: the LoRA adapter → GGUF convert script (imports from convert_hf_to_gguf.py)

@@ -28,7 +28,6 @@
 sys.path.insert(1, str(Path(__file__).parent / "gguf-py"))
 import gguf
-# reuse model definitions from convert_hf_to_gguf.py
 from convert_hf_to_gguf import LazyTorchTensor, Model
 logger = logging.getLogger("lora-to-gguf")
@@ -40,10 +39,9 @@ class PartialLoraTensor:
 B: Tensor | None = None
-# magic to support tensor shape modifications and splitting
 class LoraTorchTensor:
-_lora_A: Tensor  # (n_rank, row_size)
-_lora_B: Tensor  # (col_size, n_rank)
+_lora_A: Tensor
+_lora_B: Tensor
 _rank: int
 def __init__(self, A: Tensor, B: Tensor):
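The shape comments being dropped here were the only in-code note of the convention, so for orientation: a LoRA pair stands in for a dense weight delta B @ A, with _lora_A shaped (n_rank, row_size) and _lora_B shaped (col_size, n_rank). LoraTorchTensor exposes the (col_size, row_size) shape of that product without materialising it, which is why the indexing, reshape and permute methods below each have to be mapped onto A and B separately. A minimal sketch of the identity, with illustrative sizes:

    import torch

    n_rank, row_size, col_size = 8, 4096, 11008  # illustrative sizes
    A = torch.randn(n_rank, row_size)   # corresponds to _lora_A
    B = torch.randn(col_size, n_rank)   # corresponds to _lora_B

    delta_w = B @ A                     # the dense delta the pair represents
    assert delta_w.shape == (col_size, row_size)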
@@ -61,20 +59,14 @@ def get_lora_A_B(self) -> tuple[Tensor, Tensor]:
 def __getitem__(
 self,
-indices: (
-SupportsIndex
-| slice
-| tuple[
-SupportsIndex | slice | Tensor, ...
-]  # TODO: add ellipsis in the type signature
-),
+indices: SupportsIndex | slice | tuple[SupportsIndex | slice | Tensor, ...],
 ) -> LoraTorchTensor:
 shape = self.shape
 if isinstance(indices, SupportsIndex):
 if len(shape) > 2:
 return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
 else:
-raise NotImplementedError  # can't return a vector
+raise NotImplementedError
 elif isinstance(indices, slice):
 if len(shape) > 2:
 return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
@@ -84,7 +76,7 @@ def __getitem__(
 assert len(indices) > 0
 if indices[-1] is Ellipsis:
 return self[indices[:-1]]
-# expand ellipsis
 indices = tuple(
 u
 for v in (
@@ -104,7 +96,6 @@ def __getitem__(
 *(slice(None, None) for _ in range(len(indices), len(shape))),
 )
-# TODO: make sure this is correct
 indices_A = (
 *(
 (
@@ -120,7 +111,7 @@ def __getitem__(
 indices_B = indices[:-1]
 return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
 else:
-raise NotImplementedError  # unknown indice type
+raise NotImplementedError
 @property
 def dtype(self) -> torch.dtype:
@@ -143,9 +134,8 @@ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
 new_shape = cast(tuple[int, ...], shape)
 orig_shape = self.shape
 if len(new_shape) < 2:
-raise NotImplementedError  # can't become a vector
+raise NotImplementedError
-# expand -1 in the shape
 if any(dim == -1 for dim in new_shape):
 n_elems = prod(orig_shape)
 n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
@@ -155,7 +145,7 @@ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
 )
 if new_shape[-1] != orig_shape[-1]:
-raise NotImplementedError  # can't reshape the row size trivially
+raise NotImplementedError
 shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
 shape_B = (*new_shape[:-1], self._rank)
@@ -174,7 +164,7 @@ def permute(self, *dims: int) -> LoraTorchTensor:
 shape = self.shape
 dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
 if dims[-1] == -1:
-# TODO: support higher dimensional A shapes bigger than 1
 assert all(dim == 1 for dim in self._lora_A.shape[:-2])
 return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
 if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
@@ -182,7 +172,7 @@ def permute(self, *dims: int) -> LoraTorchTensor:
 self._lora_B.permute(*dims), self._lora_A.permute(*dims)
 )
 else:
-# TODO: compose the above two
 raise NotImplementedError
 def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:
@@ -201,7 +191,7 @@ def to(self, *args, **kwargs):
 @classmethod
 def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
-del types  # unused
+del types
 if kwargs is None:
 kwargs = {}
@@ -245,58 +235,21 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
 return base_name
-def pyinstaller_include():
-# PyInstaller import
-pass
 def parse_args() -> argparse.Namespace:
-parser = argparse.ArgumentParser(
-description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file"
-)
-parser.add_argument(
-"--outfile",
-type=Path,
-help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
-)
+parser = argparse.ArgumentParser()
+parser.add_argument("--outfile", type=Path)
 parser.add_argument(
 "--outtype",
 type=str,
 choices=["f32", "f16", "bf16", "q8_0", "auto"],
 default="f16",
-help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
-)
-parser.add_argument(
-"--bigendian",
-action="store_true",
-help="model is executed on big endian machine",
-)
-parser.add_argument(
-"--no-lazy",
-action="store_true",
-help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
-)
-parser.add_argument(
-"--verbose",
-action="store_true",
-help="increase output verbosity",
-)
-parser.add_argument(
-"--dry-run",
-action="store_true",
-help="only print out what will be done, without writing any new files",
-)
-parser.add_argument(
-"--base",
-type=Path,
-required=True,
-help="directory containing base model file",
-)
-parser.add_argument(
-"lora_path",
-type=Path,
-help="directory containing LoRA adapter file",
 )
+parser.add_argument("--bigendian", action="store_true")
+parser.add_argument("--no-lazy", action="store_true")
+parser.add_argument("--verbose", action="store_true")
+parser.add_argument("--dry-run", action="store_true")
+parser.add_argument("--base", type=Path, required=True)
+parser.add_argument("lora_path", type=Path)
 return parser.parse_args()
@@ -323,11 +276,11 @@ def parse_args() -> argparse.Namespace:
 if args.outfile is not None:
 fname_out = args.outfile
 else:
-# output in the same directory as the model by default
 fname_out = dir_lora
 if os.path.exists(input_model):
-# lazy import load_file only if lora is in safetensors format.
 from safetensors.torch import load_file
 lora_model = load_file(input_model, device="cpu")
@@ -335,7 +288,6 @@ def parse_args() -> argparse.Namespace:
 input_model = os.path.join(dir_lora, "adapter_model.bin")
 lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
-# load base model
 logger.info(f"Loading base model: {dir_base_model.name}")
 hparams = Model.load_hparams(dir_base_model)
 with torch.inference_mode():
@@ -431,6 +383,7 @@ def modify_tensors(
 dry_run=args.dry_run,
 dir_lora_model=dir_lora,
 lora_alpha=alpha,
+is_lora=True,
 )
 logger.info("Exporting model...")