Compare commits


263 Commits
v1.4.2 ... main

Author SHA1 Message Date
leafspark 4458ad1e58
Merge pull request #139 from leafspark/dependabot/pip/huggingface-hub-approx-eq-0.33.1
build(deps): update huggingface-hub requirement from ~=0.31.2 to ~=0.33.1
2025-07-03 11:38:30 -07:00
dependabot[bot] fe5c943b7d
build(deps): update huggingface-hub requirement
Updates the requirements on [huggingface-hub](https://github.com/huggingface/huggingface_hub) to permit the latest version.
- [Release notes](https://github.com/huggingface/huggingface_hub/releases)
- [Commits](https://github.com/huggingface/huggingface_hub/compare/v0.31.2...v0.33.1)

---
updated-dependencies:
- dependency-name: huggingface-hub
  dependency-version: 0.33.1
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-06-29 11:09:34 +00:00
leafspark 8681a36452
Merge pull request #132 from leafspark/dependabot/pip/pyside6-approx-eq-6.9.1
build(deps): update pyside6 requirement from ~=6.9.0 to ~=6.9.1
2025-06-27 10:48:20 -07:00
dependabot[bot] 9516762dae
build(deps): update pyside6 requirement from ~=6.9.0 to ~=6.9.1
Updates the requirements on [pyside6](https://pyside.org) to permit the latest version.

---
updated-dependencies:
- dependency-name: pyside6
  dependency-version: 6.9.1
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-06-08 11:09:56 +00:00
leafspark f7a83b3cd8
docs: changelog for v2.0.1 2025-05-28 16:17:16 -07:00
leafspark cd16f3eab6
ci: automate checksums and fix build errors 2025-05-24 21:41:33 -07:00
leafspark 8f3d93461a
ci: update upload-artifact to v4 2025-05-24 21:32:56 -07:00
leafspark 50792722e9
docs: update readme for v2.0.1
- added more information about patches and new updates
- edited quick start instructions
2025-05-24 21:27:02 -07:00
BuildTools 7c2a0b7ec1
feat(ui): update display of properties and add certifi
- updated project files
- added certifi to backend download and update checking
- add and fix type hints
- small file formatting changes
- update formatting of KV pairs to be cleaner
- update tensor data formatting and remove redundant KV pairs property
- add human readable mappings from KV pairs into model properties
- update CUDA backend check for latest llama.cpp format
- use urllib globally
2025-05-24 21:12:22 -07:00
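The commit above routes HTTP fetches through urllib with certifi for certificate verification. A minimal sketch of that pattern, assuming a hypothetical fetch helper and an illustrative llama.cpp releases URL (not the project's actual code):

import ssl
import urllib.request

import certifi


def fetch_bytes(url: str) -> bytes:
    # Use certifi's CA bundle so TLS verification also works in frozen
    # (PyInstaller/Nuitka) builds that lack a system certificate store.
    context = ssl.create_default_context(cafile=certifi.where())
    with urllib.request.urlopen(url, context=context) as response:
        return response.read()


if __name__ == "__main__":
    payload = fetch_bytes("https://api.github.com/repos/ggerganov/llama.cpp/releases/latest")
    print(f"received {len(payload)} bytes")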
BuildTools 1381665d00
Merge branch 'main' of https://github.com/leafspark/AutoGGUF 2025-05-15 19:02:44 -07:00
BuildTools 35ad690198
feat(core): update llama.cpp, improve backend UI, logging, and task handling
- update llama.cpp python to `bc098c3` (now adds support for Qwen3, Llama 4, etc.)
- update requirements and general maint
- UI fixes in AutoGGUF
- Updated backend selection box to sort by newest version
- Fixed log information box inserting newlines on open and autoscroll
- Modified task deletion behavior
- Fixed logging for cancellation/deletion
- Updated readme information
2025-05-15 19:01:51 -07:00
leafspark 0d97ea1d46
Merge pull request #116 from leafspark/dependabot/pip/torch-approx-eq-2.7.0
build(deps): update torch requirement from ~=2.5.1 to ~=2.7.0
2025-04-27 10:01:28 -06:00
dependabot[bot] 0625a0776e
build(deps): update torch requirement from ~=2.5.1 to ~=2.7.0
Updates the requirements on [torch](https://github.com/pytorch/pytorch) to permit the latest version.
- [Release notes](https://github.com/pytorch/pytorch/releases)
- [Changelog](https://github.com/pytorch/pytorch/blob/main/RELEASE.md)
- [Commits](https://github.com/pytorch/pytorch/compare/v2.5.1...v2.7.0)

---
updated-dependencies:
- dependency-name: torch
  dependency-version: 2.7.0
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-04-27 11:04:50 +00:00
leafspark 6f74245f29
Merge pull request #114 from leafspark/dependabot/pip/uvicorn-approx-eq-0.34.2
build(deps): update uvicorn requirement from ~=0.34.0 to ~=0.34.2
2025-04-21 21:12:44 -06:00
dependabot[bot] a8ac35d6b7
build(deps): update uvicorn requirement from ~=0.34.0 to ~=0.34.2
Updates the requirements on [uvicorn](https://github.com/encode/uvicorn) to permit the latest version.
- [Release notes](https://github.com/encode/uvicorn/releases)
- [Changelog](https://github.com/encode/uvicorn/blob/master/docs/release-notes.md)
- [Commits](https://github.com/encode/uvicorn/compare/0.34.0...0.34.2)

---
updated-dependencies:
- dependency-name: uvicorn
  dependency-version: 0.34.2
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-04-20 11:32:15 +00:00
BuildTools b4817eee06
refactor(ggml): update safetensor conversion scripts 2025-03-22 09:41:54 -07:00
leafspark c9c2b04534
Merge pull request #97 from leafspark/dependabot/pip/huggingface-hub-approx-eq-0.29.2
build(deps): update huggingface-hub requirement from ~=0.29.1 to ~=0.29.2
2025-03-17 15:35:47 -07:00
dependabot[bot] cc47e59f37
build(deps): update huggingface-hub requirement
Updates the requirements on [huggingface-hub](https://github.com/huggingface/huggingface_hub) to permit the latest version.
- [Release notes](https://github.com/huggingface/huggingface_hub/releases)
- [Commits](https://github.com/huggingface/huggingface_hub/compare/v0.29.1...v0.29.2)

---
updated-dependencies:
- dependency-name: huggingface-hub
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-03-09 11:52:53 +00:00
leafspark 59bc29b2ab
chore: update setup.py email and version 2025-03-04 20:34:10 -08:00
leafspark 14ceec61da
Merge pull request #91 from leafspark/dependabot/pip/huggingface-hub-approx-eq-0.29.1
build(deps): update huggingface-hub requirement from ~=0.27.0 to ~=0.29.1
2025-03-04 20:32:25 -08:00
dependabot[bot] 23ebe47d26
build(deps): update huggingface-hub requirement
Updates the requirements on [huggingface-hub](https://github.com/huggingface/huggingface_hub) to permit the latest version.
- [Release notes](https://github.com/huggingface/huggingface_hub/releases)
- [Commits](https://github.com/huggingface/huggingface_hub/compare/v0.27.0...v0.29.1)

---
updated-dependencies:
- dependency-name: huggingface-hub
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-02-23 11:48:40 +00:00
leafspark 29886faff6
Merge pull request #90 from leafspark/dependabot/pip/psutil-approx-eq-7.0.0
build(deps): update psutil requirement from ~=6.1.1 to ~=7.0.0
2025-02-19 20:03:14 -08:00
dependabot[bot] 4742e6b242
build(deps): update psutil requirement from ~=6.1.1 to ~=7.0.0
Updates the requirements on [psutil](https://github.com/giampaolo/psutil) to permit the latest version.
- [Changelog](https://github.com/giampaolo/psutil/blob/master/HISTORY.rst)
- [Commits](https://github.com/giampaolo/psutil/compare/release-6.1.1...release-7.0.0)

---
updated-dependencies:
- dependency-name: psutil
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-02-16 11:20:40 +00:00
leafspark 3ebc0d63f4
ci: update artifact upload version 2025-02-10 17:36:09 -08:00
leafspark ab0035f2e9
Merge pull request #84 from leafspark/dependabot/pip/pyside6-approx-eq-6.8.2
build(deps): update pyside6 requirement from ~=6.8.1 to ~=6.8.2
2025-02-03 20:01:46 -08:00
dependabot[bot] a266dfba92
build(deps): update pyside6 requirement from ~=6.8.1 to ~=6.8.2
Updates the requirements on [pyside6](https://pyside.org) to permit the latest version.

---
updated-dependencies:
- dependency-name: pyside6
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-02-02 11:29:49 +00:00
leafspark 97d5050a8b
chore: updated changelog for v2 2025-01-27 19:04:15 -08:00
BuildTools 93daedc285
feat(backend): allow setting fetch repository
- add AUTOGGUF_BACKEND_REPO environment variable to set GitHub repo to fetch releases
- remove Import Model confirmation
- fix error when deleting models from list
- add localizations and update README with message
2025-01-27 15:32:07 -08:00
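The AUTOGGUF_BACKEND_REPO variable mentioned above selects which GitHub repository backend releases are fetched from. A hedged sketch of the idea, with an assumed function name and the default taken from .env.example (ggerganov/llama.cpp):

import json
import os
import urllib.request


def list_backend_releases() -> list[str]:
    # Fall back to the default llama.cpp repository when the variable is unset.
    repo = os.environ.get("AUTOGGUF_BACKEND_REPO", "ggerganov/llama.cpp")
    url = f"https://api.github.com/repos/{repo}/releases"
    with urllib.request.urlopen(url) as response:
        releases = json.load(response)
    return [release["tag_name"] for release in releases]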
leafspark a0d00ab999
Merge pull request #76 from leafspark/dependabot/pip/safetensors-approx-eq-0.5.2
build(deps): update safetensors requirement from ~=0.5.0 to ~=0.5.2
2025-01-24 18:16:07 -08:00
leafspark 8c79a5d213
Merge pull request #78 from leafspark/dependabot/pip/transformers-approx-eq-4.48.0
build(deps): update transformers requirement from ~=4.47.1 to ~=4.48.0
2025-01-15 18:31:53 -08:00
dependabot[bot] 1955495899
build(deps): update transformers requirement from ~=4.47.1 to ~=4.48.0
Updates the requirements on [transformers](https://github.com/huggingface/transformers) to permit the latest version.
- [Release notes](https://github.com/huggingface/transformers/releases)
- [Commits](https://github.com/huggingface/transformers/compare/v4.47.1...v4.48.0)

---
updated-dependencies:
- dependency-name: transformers
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-01-12 11:22:11 +00:00
dependabot[bot] 18dd8878a4
build(deps): update safetensors requirement from ~=0.5.0 to ~=0.5.2
Updates the requirements on [safetensors](https://github.com/huggingface/safetensors) to permit the latest version.
- [Release notes](https://github.com/huggingface/safetensors/releases)
- [Changelog](https://github.com/huggingface/safetensors/blob/main/RELEASE.md)
- [Commits](https://github.com/huggingface/safetensors/compare/v0.5.0...v0.5.2)

---
updated-dependencies:
- dependency-name: safetensors
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-01-08 23:13:16 +00:00
BuildTools 102e3a14fd
chore: update for new year and improve compliance
- updated copyright year in LICENSE file to 2025
- bundled llama.cpp licensing text in About menu to maintain MIT compliance
- updated llama.cpp and gguf Python library and scripts
- adjusted monitoring intervals from 0.2s to 0.5s
- updated Python requirements to latest compatible versions
- added new HF to GGUF conversion types: `tq1_0` and `tq2_0`

Happy New Year 🎉!
2025-01-08 15:11:47 -08:00
leafspark ddbf96c8e9
Merge pull request #71 from leafspark/dependabot/pip/uvicorn-approx-eq-0.34.0
build(deps): update uvicorn requirement from ~=0.33.0 to ~=0.34.0
2025-01-02 22:51:18 -08:00
dependabot[bot] 403546bfcf
build(deps): update uvicorn requirement from ~=0.33.0 to ~=0.34.0
Updates the requirements on [uvicorn](https://github.com/encode/uvicorn) to permit the latest version.
- [Release notes](https://github.com/encode/uvicorn/releases)
- [Changelog](https://github.com/encode/uvicorn/blob/master/CHANGELOG.md)
- [Commits](https://github.com/encode/uvicorn/compare/0.33.0...0.34.0)

---
updated-dependencies:
- dependency-name: uvicorn
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-12-18 05:07:18 +00:00
BuildTools 53482af554
feat(ui): add clipboard support to save/load preset
- add clipboard support to save/load preset with shift clicking
- update README.md for clarity
- fixes incorrect menu bar name for Load Preset
- update Czech translations
2024-12-17 21:05:58 -08:00
leafspark b49d4ca774
Merge pull request #66 from leafspark/dependabot/pip/pyside6-approx-eq-6.8.1
build(deps): update pyside6 requirement from ~=6.8.0.2 to ~=6.8.1
2024-12-12 14:45:17 -08:00
dependabot[bot] 62e5560650
build(deps): update pyside6 requirement from ~=6.8.0.2 to ~=6.8.1
Updates the requirements on [pyside6](https://pyside.org) to permit the latest version.

---
updated-dependencies:
- dependency-name: pyside6
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-12-08 11:45:09 +00:00
leafspark 3f5d9e6a1b
Merge pull request #60 from leafspark/dependabot/pip/uvicorn-approx-eq-0.32.1
build(deps): update uvicorn requirement from ~=0.32.0 to ~=0.32.1
2024-12-02 18:08:14 -08:00
dependabot[bot] 980a5b6656
build(deps): update uvicorn requirement from ~=0.32.0 to ~=0.32.1
Updates the requirements on [uvicorn](https://github.com/encode/uvicorn) to permit the latest version.
- [Release notes](https://github.com/encode/uvicorn/releases)
- [Changelog](https://github.com/encode/uvicorn/blob/master/CHANGELOG.md)
- [Commits](https://github.com/encode/uvicorn/compare/0.32.0...0.32.1)

---
updated-dependencies:
- dependency-name: uvicorn
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-11-24 11:38:16 +00:00
leafspark 2ac27d62f1
Merge pull request #57 from leafspark/dependabot/pip/setuptools-approx-eq-75.5.0
build(deps): update setuptools requirement from ~=75.1.0 to ~=75.5.0
2024-11-19 17:37:45 -08:00
dependabot[bot] 7dd39b208a
build(deps): update setuptools requirement from ~=75.1.0 to ~=75.5.0
Updates the requirements on [setuptools](https://github.com/pypa/setuptools) to permit the latest version.
- [Release notes](https://github.com/pypa/setuptools/releases)
- [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst)
- [Commits](https://github.com/pypa/setuptools/compare/v75.1.0...v75.5.0)

---
updated-dependencies:
- dependency-name: setuptools
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-11-17 11:12:42 +00:00
BuildTools 749f3215ec
feat(ui): support shift clicking to get quantization command
- support shift clicking Quantize Model button to get quantize command
- clean up imports in AutoGGUF.py and add localization keys
- use str() for getting log_dir_name
- remove legacy validate_quantization_inputs() function
- add return_command parameter to quantize_model() function
2024-11-12 19:41:59 -08:00
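The return_command flag described above lets the same code path either run the quantization or hand the command back for the shift-click copy behavior. A rough, illustrative-only sketch; argument names and the command layout are assumptions:

import shlex
import subprocess


def quantize_model(binary: str, in_path: str, out_path: str,
                   quant_type: str, return_command: bool = False) -> str | None:
    command = [binary, in_path, out_path, quant_type]
    if return_command:
        # Shift-click path: return the command string instead of executing it.
        return shlex.join(command)
    subprocess.run(command, check=True)
    return None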
leafspark 6aaefb2ccb
Merge pull request #56 from leafspark/dependabot/pip/fastapi-approx-eq-0.115.5
build(deps): update fastapi requirement from ~=0.115.2 to ~=0.115.5
2024-11-12 19:38:20 -08:00
leafspark 9955640f03
Merge pull request #48 from leafspark/dependabot/pip/huggingface-hub-approx-eq-0.26.2
build(deps): update huggingface-hub requirement from ~=0.25.2 to ~=0.26.2
2024-11-12 19:37:58 -08:00
dependabot[bot] 50a36e5abe
build(deps): update fastapi requirement from ~=0.115.2 to ~=0.115.5
Updates the requirements on [fastapi](https://github.com/fastapi/fastapi) to permit the latest version.
- [Release notes](https://github.com/fastapi/fastapi/releases)
- [Commits](https://github.com/fastapi/fastapi/compare/0.115.2...0.115.5)

---
updated-dependencies:
- dependency-name: fastapi
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-11-13 03:37:51 +00:00
dependabot[bot] da7d1152ea
build(deps): update huggingface-hub requirement
Updates the requirements on [huggingface-hub](https://github.com/huggingface/huggingface_hub) to permit the latest version.
- [Release notes](https://github.com/huggingface/huggingface_hub/releases)
- [Commits](https://github.com/huggingface/huggingface_hub/compare/v0.25.2...v0.26.2)

---
updated-dependencies:
- dependency-name: huggingface-hub
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-11-13 03:37:40 +00:00
leafspark 6b3d9ce9b1
Merge pull request #54 from leafspark/dependabot/pip/transformers-approx-eq-4.46.2
build(deps): update transformers requirement from ~=4.46.0 to ~=4.46.2
2024-11-12 19:36:36 -08:00
dependabot[bot] 6230450f6e
build(deps): update transformers requirement from ~=4.46.0 to ~=4.46.2
Updates the requirements on [transformers](https://github.com/huggingface/transformers) to permit the latest version.
- [Release notes](https://github.com/huggingface/transformers/releases)
- [Commits](https://github.com/huggingface/transformers/compare/v4.46.0...v4.46.2)

---
updated-dependencies:
- dependency-name: transformers
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-11-10 11:49:15 +00:00
leafspark b9aad59fa0
Merge pull request #53 from leafspark/dependabot/pip/torch-approx-eq-2.5.1
build(deps): update torch requirement from ~=2.5.0 to ~=2.5.1
2024-11-07 15:28:21 -08:00
dependabot[bot] 24e19dad9d
build(deps): update torch requirement from ~=2.5.0 to ~=2.5.1
Updates the requirements on [torch](https://github.com/pytorch/pytorch) to permit the latest version.
- [Release notes](https://github.com/pytorch/pytorch/releases)
- [Changelog](https://github.com/pytorch/pytorch/blob/main/RELEASE.md)
- [Commits](https://github.com/pytorch/pytorch/compare/v2.5.0...v2.5.1)

---
updated-dependencies:
- dependency-name: torch
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-11-03 11:37:47 +00:00
leafspark 0855a88433
ci: remove broken x86 build matrix 2024-11-02 09:34:38 -07:00
leafspark 96c31b58c9
Merge pull request #46 from leafspark/dependabot/pip/transformers-approx-eq-4.46.0
build(deps): update transformers requirement from ~=4.45.1 to ~=4.46.0
2024-11-02 09:33:30 -07:00
leafspark c8d6cf0ea8
Merge pull request #39 from leafspark/dependabot/pip/psutil-approx-eq-6.1.0
build(deps): update psutil requirement from ~=6.0.0 to ~=6.1.0
2024-10-27 20:20:41 -07:00
leafspark 0d95af5f72
Merge pull request #42 from leafspark/dependabot/pip/torch-approx-eq-2.5.0
build(deps): update torch requirement from ~=2.4.1 to ~=2.5.0
2024-10-27 20:20:23 -07:00
leafspark 988b5b61c3
Merge pull request #45 from leafspark/dependabot/pip/pyside6-approx-eq-6.8.0.2
build(deps): update pyside6 requirement from ~=6.8.0.1 to ~=6.8.0.2
2024-10-27 20:20:10 -07:00
dependabot[bot] f66b7fb870
build(deps): update transformers requirement from ~=4.45.1 to ~=4.46.0
Updates the requirements on [transformers](https://github.com/huggingface/transformers) to permit the latest version.
- [Release notes](https://github.com/huggingface/transformers/releases)
- [Commits](https://github.com/huggingface/transformers/compare/v4.45.1...v4.46.0)

---
updated-dependencies:
- dependency-name: transformers
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-10-27 11:14:48 +00:00
dependabot[bot] 87ddc00452
build(deps): update pyside6 requirement from ~=6.8.0.1 to ~=6.8.0.2
Updates the requirements on [pyside6](https://pyside.org) to permit the latest version.

---
updated-dependencies:
- dependency-name: pyside6
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-10-27 11:14:44 +00:00
leafspark fe914f84c2
Merge pull request #40 from leafspark/dependabot/pip/uvicorn-approx-eq-0.32.0
build(deps): update uvicorn requirement from ~=0.31.1 to ~=0.32.0
2024-10-22 17:45:48 -07:00
dependabot[bot] 3b49ceedb1
build(deps): update torch requirement from ~=2.4.1 to ~=2.5.0
Updates the requirements on [torch](https://github.com/pytorch/pytorch) to permit the latest version.
- [Release notes](https://github.com/pytorch/pytorch/releases)
- [Changelog](https://github.com/pytorch/pytorch/blob/main/RELEASE.md)
- [Commits](https://github.com/pytorch/pytorch/compare/v2.4.1...v2.5.0)

---
updated-dependencies:
- dependency-name: torch
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-10-20 11:21:22 +00:00
dependabot[bot] 4df2525e8a
build(deps): update uvicorn requirement from ~=0.31.1 to ~=0.32.0
Updates the requirements on [uvicorn](https://github.com/encode/uvicorn) to permit the latest version.
- [Release notes](https://github.com/encode/uvicorn/releases)
- [Changelog](https://github.com/encode/uvicorn/blob/master/CHANGELOG.md)
- [Commits](https://github.com/encode/uvicorn/compare/0.31.1...0.32.0)

---
updated-dependencies:
- dependency-name: uvicorn
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-10-20 11:21:12 +00:00
dependabot[bot] 5747807391
build(deps): update psutil requirement from ~=6.0.0 to ~=6.1.0
Updates the requirements on [psutil](https://github.com/giampaolo/psutil) to permit the latest version.
- [Changelog](https://github.com/giampaolo/psutil/blob/master/HISTORY.rst)
- [Commits](https://github.com/giampaolo/psutil/compare/release-6.0.0...release-6.1.0)

---
updated-dependencies:
- dependency-name: psutil
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-10-20 11:21:07 +00:00
leafspark 0c1df319cd
build(deps): update PySide6 to resolve segfault 2024-10-16 17:40:15 -07:00
BuildTools 118273f618
ci: force Python 3.12 for build.yml
- force Python 3.12 for build.yml to ensure PySide6 compatibility
2024-10-13 10:27:27 -07:00
BuildTools 5167f8f0f7
refactor: prepare for v1.9.1
- add Nuitka build script for Linux (build_optimized.sh)
- update CHANGELOG.md for v1.9.1
- update README.md to match new version and features
- improve clarity of CONTRIBUTING.md
- sync Windows and Linux build scripts
2024-10-13 10:21:28 -07:00
BuildTools 7575c97f6a
feat(config): support specifying log directory name
- support specifying log directory name using AUTOGGUF_LOG_DIR_NAME environment variable
- update dependencies, except transformers due to a small regression
2024-10-13 09:59:50 -07:00
leafspark 4011bbdd09
Merge pull request #32 from leafspark/dependabot/pip/uvicorn-approx-eq-0.31.0
build(deps): update uvicorn requirement from ~=0.30.6 to ~=0.31.0
2024-10-10 20:33:01 -07:00
BuildTools 73e2f44cf9
Merge branch 'main' of https://github.com/leafspark/AutoGGUF 2024-10-05 21:02:51 -07:00
BuildTools 35839eee77
refactor: use localizations for menubar and structure optimization
- use localizations for menubar
- bump AutoGGUF version to v2.0.0
- rename imports_and_globals.py to globals.py
- reformat code
- use file select for Merge/Split GGUF functions
- move general functions verify_gguf and process_args to globals.py
- create Plugins class for extensibility
2024-10-05 21:02:44 -07:00
leafspark 8c1bd73db3
Merge pull request #29 from leafspark/dependabot/pip/pyside6-approx-eq-6.7.3
build(deps): update pyside6 requirement from ~=6.7.2 to ~=6.7.3
2024-10-05 20:36:06 -07:00
BuildTools b1b3a3549a
refactor: use correct strings for GGUF merging 2024-10-04 15:23:56 -07:00
dependabot[bot] c2075586c5
build(deps): update uvicorn requirement from ~=0.30.6 to ~=0.31.0
Updates the requirements on [uvicorn](https://github.com/encode/uvicorn) to permit the latest version.
- [Release notes](https://github.com/encode/uvicorn/releases)
- [Changelog](https://github.com/encode/uvicorn/blob/master/CHANGELOG.md)
- [Commits](https://github.com/encode/uvicorn/compare/0.30.6...0.31.0)

---
updated-dependencies:
- dependency-name: uvicorn
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-30 22:06:13 +00:00
leafspark ef4bbcef2a
build(deps): update transformers requirement from ~=4.44.2 to ~=4.45.1
- adds support for MLLama, Pixtral, and a couple new models
2024-09-30 15:05:02 -07:00
BuildTools c1ff9d1033
Merge branch 'main' of https://github.com/leafspark/AutoGGUF 2024-09-29 23:42:25 -07:00
BuildTools e67992ec71
feat(gguf): add work in progress GGUF merge window
- localization strings and file select types incorrect
2024-09-29 23:42:04 -07:00
dependabot[bot] d1d9c687f9
build(deps): update transformers requirement from ~=4.44.2 to ~=4.45.1
Updates the requirements on [transformers](https://github.com/huggingface/transformers) to permit the latest version.
- [Release notes](https://github.com/huggingface/transformers/releases)
- [Commits](https://github.com/huggingface/transformers/compare/v4.44.2...v4.45.1)

---
updated-dependencies:
- dependency-name: transformers
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-29 11:53:15 +00:00
dependabot[bot] cb3edb9a4a
build(deps): update pyside6 requirement from ~=6.7.2 to ~=6.7.3
Updates the requirements on [pyside6](https://pyside.org) to permit the latest version.

---
updated-dependencies:
- dependency-name: pyside6
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-29 11:53:08 +00:00
leafspark ce6fa92742
Merge pull request #24 from leafspark/dependabot/pip/setuptools-approx-eq-75.1.0
build(deps): update setuptools requirement from ~=74.1.2 to ~=75.1.0
2024-09-25 19:07:38 -07:00
BuildTools c831622d6b
feat(hf): add support for repository types
- add support for repository types in HF Transfer utility
- add dequantize_gguf.py script
- improve layout of HF Upload window
2024-09-22 09:48:48 -07:00
dependabot[bot] b5dede1ac7
build(deps): update setuptools requirement from ~=74.1.2 to ~=75.1.0
Updates the requirements on [setuptools](https://github.com/pypa/setuptools) to permit the latest version.
- [Release notes](https://github.com/pypa/setuptools/releases)
- [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst)
- [Commits](https://github.com/pypa/setuptools/compare/v74.1.2...v75.1.0)

---
updated-dependencies:
- dependency-name: setuptools
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-22 16:37:38 +00:00
leafspark ac0f011784
Merge pull request #27 from leafspark/dependabot/pip/fastapi-approx-eq-0.115.0
build(deps): update fastapi requirement from ~=0.114.2 to ~=0.115.0
2024-09-22 09:36:35 -07:00
dependabot[bot] 4e4ba99370
build(deps): update fastapi requirement from ~=0.114.2 to ~=0.115.0
Updates the requirements on [fastapi](https://github.com/fastapi/fastapi) to permit the latest version.
- [Release notes](https://github.com/fastapi/fastapi/releases)
- [Commits](https://github.com/fastapi/fastapi/compare/0.114.2...0.115.0)

---
updated-dependencies:
- dependency-name: fastapi
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-22 16:35:26 +00:00
leafspark f3a71cad00
Merge pull request #26 from leafspark/dependabot/pip/huggingface-hub-approx-eq-0.25.0
build(deps): update huggingface-hub requirement from ~=0.24.7 to ~=0.25.0
2024-09-22 09:34:20 -07:00
dependabot[bot] 0e92fb34ed
build(deps): update huggingface-hub requirement
Updates the requirements on [huggingface-hub](https://github.com/huggingface/huggingface_hub) to permit the latest version.
- [Release notes](https://github.com/huggingface/huggingface_hub/releases)
- [Commits](https://github.com/huggingface/huggingface_hub/compare/v0.24.7...v0.25.0)

---
updated-dependencies:
- dependency-name: huggingface-hub
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-22 11:38:09 +00:00
BuildTools 39441e503f
feat(convert): update llama.cpp convert scripts
- added support for MiniCPM3, RWKVv6, OLMoE, IBM Granite, and Jamba (conversion only: https://github.com/ggerganov/llama.cpp/pull/7531)
- update gguf library from upstream
2024-09-20 15:05:52 -07:00
BuildTools 4e51ed2f56
refactor: update Finnish and Russian localizations
- update Finnish and Russian localizations using Claude 3 Opus
2024-09-17 17:41:50 -07:00
BuildTools 45b10bdcbb
chore: bump version to v1.9.0 2024-09-15 13:08:26 -07:00
BuildTools a2516fb3f6
Merge branch 'main' of https://github.com/leafspark/AutoGGUF 2024-09-15 12:49:09 -07:00
BuildTools c02a02fbc1
feat(ui): update Japanese and German localizations
- update Japanese and German localizations
- bump version to v1.9.0
- optimize imports in main.py
- organize function definitions in AutoGGUF.py
2024-09-15 12:48:41 -07:00
leafspark 9b4ec4718b
ci: remove pylint 3.8 checks 2024-09-15 12:33:18 -07:00
leafspark 607f32b154
Merge pull request #21 from leafspark/dependabot/pip/huggingface-hub-approx-eq-0.24.7
build(deps): update huggingface-hub requirement from ~=0.24.6 to ~=0.24.7
2024-09-15 12:32:13 -07:00
leafspark fa6077c161
Merge pull request #22 from leafspark/dependabot/pip/fastapi-approx-eq-0.114.2
build(deps): update fastapi requirement from ~=0.114.0 to ~=0.114.2
2024-09-15 12:32:04 -07:00
BuildTools 1133422456
refactor: move functions out of AutoGGUF
- debloat AutoGGUF
- localize some GGUF split strings
2024-09-15 12:31:34 -07:00
BuildTools 79c74bb54a
feat(core): implement HF upload functionality
- add HF upload GUI definitions
- remove old HFTransfer class
- update localizations
2024-09-15 12:18:19 -07:00
dependabot[bot] e7d0c66405
build(deps): update fastapi requirement from ~=0.114.0 to ~=0.114.2
Updates the requirements on [fastapi](https://github.com/fastapi/fastapi) to permit the latest version.
- [Release notes](https://github.com/fastapi/fastapi/releases)
- [Commits](https://github.com/fastapi/fastapi/compare/0.114.0...0.114.2)

---
updated-dependencies:
- dependency-name: fastapi
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-15 11:40:11 +00:00
dependabot[bot] ca69ac0c1e
build(deps): update huggingface-hub requirement
Updates the requirements on [huggingface-hub](https://github.com/huggingface/huggingface_hub) to permit the latest version.
- [Release notes](https://github.com/huggingface/huggingface_hub/releases)
- [Commits](https://github.com/huggingface/huggingface_hub/compare/v0.24.6...v0.24.7)

---
updated-dependencies:
- dependency-name: huggingface-hub
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-15 11:40:07 +00:00
BuildTools a80775b0a2
ci: update Actions versions
- update Actions versions
- update pylint workflow
2024-09-14 10:20:20 -07:00
BuildTools 122510c2a4
Merge branch 'main' of https://github.com/leafspark/AutoGGUF 2024-09-14 10:11:47 -07:00
BuildTools 747aa7b9a8
refactor: optimize GGUF imports
- optimize imports in GGUF conversion utilities
- rename gguf library modules
- update .gitignore and build workflow
2024-09-14 10:11:43 -07:00
leafspark a6ef9b9023
ci: upgrade workflow versions 2024-09-12 19:48:09 -07:00
BuildTools 3804da0a3f
feat(ui): add RAM and CPU usage graphs
- add RAM and CPU usage graphs
- add input validation using wraps
- reduce strictness of iMatrix status checking
- add right click context menu to models list
2024-09-10 15:58:17 -07:00
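For the RAM and CPU usage graphs mentioned above, the sampling side can be as small as the sketch below (the psutil calls are real; the function name is illustrative). Each cpu_percent(interval=None) call reports usage since the previous call, which suits a periodic UI timer.

import psutil


def sample_usage() -> tuple[float, float]:
    cpu_percent = psutil.cpu_percent(interval=None)   # non-blocking sample
    ram_percent = psutil.virtual_memory().percent     # system-wide RAM use
    return cpu_percent, ram_percent


if __name__ == "__main__":
    print(sample_usage())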
BuildTools 4aa3eafef8
docs: update features and shields 2024-09-09 21:11:23 -07:00
BuildTools e5e18f9966
feat(core): add GGUF splitting feature
- add ability to split GGUFs
2024-09-09 20:46:53 -07:00
BuildTools cee4294ecf
feat(parallel): add support for iMatrix generation tracking
- add support for iMatrix generation tracking
- don't adjust progress bar when indeterminate
2024-09-09 19:26:56 -07:00
leafspark c96380cbf8
Merge pull request #20 from leafspark/dependabot/pip/safetensors-approx-eq-0.4.5
build(deps): update safetensors requirement from ~=0.4.4 to ~=0.4.5
2024-09-09 15:51:56 -07:00
leafspark 528ed4c5ca
Merge pull request #19 from leafspark/dependabot/pip/torch-approx-eq-2.4.1
build(deps): update torch requirement from ~=2.4.0 to ~=2.4.1
2024-09-09 15:51:40 -07:00
dependabot[bot] 0c49e71ce9
build(deps): update safetensors requirement from ~=0.4.4 to ~=0.4.5
Updates the requirements on [safetensors](https://github.com/huggingface/safetensors) to permit the latest version.
- [Release notes](https://github.com/huggingface/safetensors/releases)
- [Changelog](https://github.com/huggingface/safetensors/blob/main/RELEASE.md)
- [Commits](https://github.com/huggingface/safetensors/compare/v0.4.4...v0.4.5)

---
updated-dependencies:
- dependency-name: safetensors
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-09 22:46:34 +00:00
dependabot[bot] d918ece029
build(deps): update torch requirement from ~=2.4.0 to ~=2.4.1
Updates the requirements on [torch](https://github.com/pytorch/pytorch) to permit the latest version.
- [Release notes](https://github.com/pytorch/pytorch/releases)
- [Changelog](https://github.com/pytorch/pytorch/blob/main/RELEASE.md)
- [Commits](https://github.com/pytorch/pytorch/compare/v2.4.0...v2.4.1)

---
updated-dependencies:
- dependency-name: torch
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-09 22:46:33 +00:00
leafspark 240dd5a07d
Merge pull request #18 from leafspark/dependabot/pip/setuptools-approx-eq-74.1.2
build(deps): update setuptools requirement from ~=74.0.0 to ~=74.1.2
2024-09-09 15:45:43 -07:00
leafspark 23dc666214
Merge pull request #17 from leafspark/dependabot/pip/fastapi-approx-eq-0.114.0
build(deps): update fastapi requirement from ~=0.112.2 to ~=0.114.0
2024-09-09 15:45:29 -07:00
BuildTools fb092f8701
refactor: adjust logging and localizations
- adjust logging strings
- update French and Dutch localizations
- add minimal .env file loader in Localizations.py
2024-09-09 15:42:09 -07:00
BuildTools be38e35d99
refactor: move functions into classes
- move functions into existing classes and files
- move AutoFP8 dialog out of a function and into __init__
2024-09-09 15:11:03 -07:00
dependabot[bot] 9cf3c01331
build(deps): update setuptools requirement from ~=74.0.0 to ~=74.1.2
Updates the requirements on [setuptools](https://github.com/pypa/setuptools) to permit the latest version.
- [Release notes](https://github.com/pypa/setuptools/releases)
- [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst)
- [Commits](https://github.com/pypa/setuptools/compare/v74.0.0...v74.1.2)

---
updated-dependencies:
- dependency-name: setuptools
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-08 11:49:00 +00:00
dependabot[bot] 3138266439
build(deps): update fastapi requirement from ~=0.112.2 to ~=0.114.0
Updates the requirements on [fastapi](https://github.com/fastapi/fastapi) to permit the latest version.
- [Release notes](https://github.com/fastapi/fastapi/releases)
- [Commits](https://github.com/fastapi/fastapi/compare/0.112.2...0.114.0)

---
updated-dependencies:
- dependency-name: fastapi
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-08 11:48:56 +00:00
BuildTools e46c6260ee
fix: set progress to zero when indeterminate 2024-09-07 10:16:59 -07:00
BuildTools 454fb193f9
perf(startup): improve startup time by optimizing default configuration
- improve startup time by optimizing default configuration (disable network fetches for backends and updates)
- add startup time measuring
- use localizations for .env loader
2024-09-04 21:50:00 -07:00
BuildTools bd9889a2d4
fix: remove comments in requirements.txt 2024-09-04 21:26:06 -07:00
BuildTools 9e200a0e3b
fix: update requirements.txt formatting 2024-09-04 21:21:13 -07:00
BuildTools 8fb500a27d
refactor: remove requests and python-dotenv to reduce size 2024-09-04 21:18:49 -07:00
BuildTools 45d0212abe
Merge branch 'main' of https://github.com/leafspark/AutoGGUF 2024-09-04 17:53:44 -07:00
BuildTools f8651ef19f
fix: reorganize imports and remove pytz
- reorganize imports and remove pytz from KVOverrideEntry.py
2024-09-04 17:53:32 -07:00
leafspark b461ed2e39
docs: add more information to SECURITY.md
- add more information to SECURITY.md
- update version to v1.8.1
2024-09-04 17:47:39 -07:00
BuildTools 8c333b0ffe
docs: update CHANGELOG.md
- update CHANGELOG.md
- bump version in setup.py
2024-09-04 17:36:48 -07:00
leafspark 44f69b2b99
docs: update showcase image 2024-09-04 17:33:53 -07:00
BuildTools 24ae0066ed
feat: support key shortcuts for AutoFP8 window
- update README.md for v1.8.1
- remove aliased quant types
- update .env.example with all configuration parameters
2024-09-04 17:31:39 -07:00
BuildTools d55cb9ea9b
refactor: prepare repo for v1.8.1
- add localizations for AutoFP8
- remove unnecessary assets
- update CHANGELOG.md
- add DownloadThread.py docs
2024-09-04 17:19:54 -07:00
BuildTools 3adee266ca
feat(server): support API key authentication
- support authenticating via API key for the local server
- update Spanish localizations
2024-09-02 19:21:39 -07:00
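A minimal sketch of API-key authentication for the local server, keyed off AUTOGGUF_SERVER_API_KEY from .env.example. The header name, dependency function, and endpoint wiring are assumptions, not the repository's actual code:

import os

from fastapi import Depends, FastAPI, Header, HTTPException

app = FastAPI()


def require_api_key(x_api_key: str | None = Header(default=None)) -> None:
    expected = os.environ.get("AUTOGGUF_SERVER_API_KEY", "")
    # If no key is configured the check is a no-op; otherwise require a match.
    if expected and x_api_key != expected:
        raise HTTPException(status_code=401, detail="Invalid API key")


@app.get("/v1/health", dependencies=[Depends(require_api_key)])
def health() -> dict:
    return {"status": "ok"}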
BuildTools a91f804ec1
fix: use proper status in TaskListItem
- use proper status in TaskListItem
- make sure to pass quant_threads and Logger to TaskListItem
- remove unnecessary logging in quantize_to_fp8_dynamic.py and optimize imports
2024-09-02 18:43:22 -07:00
BuildTools a7f2dec852
feat(ui): add AutoFP8 quantization window
- add AutoFP8 quantization window (currently broken)
- add more dynamic KV parameters
2024-09-02 18:17:29 -07:00
BuildTools e43bc480c9
Merge branch 'main' of https://github.com/leafspark/AutoGGUF 2024-09-01 21:10:18 -07:00
BuildTools 79deff70ed
feat(window): add minimize/maximize button
- add minimize and maximize button to title bar
- improve smoothness of window moving
- prevent moving window below taskbar
2024-09-01 21:10:13 -07:00
leafspark f416f020a2
Merge pull request #16 from leafspark/dependabot/pip/setuptools-approx-eq-74.0.0
build(deps): update setuptools requirement from ~=68.2.0 to ~=74.0.0
2024-09-01 20:36:32 -07:00
BuildTools 81b2d4137b
feat(core): add AutoFP8 quantization classes
- add AutoFP8 quantization classes
- minor fixes based on IDE recommendations
2024-09-01 20:35:35 -07:00
dependabot[bot] cdc215a00f
build(deps): update setuptools requirement from ~=68.2.0 to ~=74.0.0
Updates the requirements on [setuptools](https://github.com/pypa/setuptools) to permit the latest version.
- [Release notes](https://github.com/pypa/setuptools/releases)
- [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst)
- [Commits](https://github.com/pypa/setuptools/compare/v68.2.0...v74.0.0)

---
updated-dependencies:
- dependency-name: setuptools
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-01 11:19:39 +00:00
BuildTools 22bd74b399
feat(server): replace Flask with FastAPI and Uvicorn
- replace Flask with FastAPI and Uvicorn
- fix web page not found error
- port is now defaulted to 7001
- bind to localhost (127.0.0.1) instead of 0.0.0.0
- improve performance by using Uvicorn
- add OpenAPI docs for endpoints
2024-08-31 15:34:47 -07:00
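The serving setup this entry describes (FastAPI behind Uvicorn, bound to localhost on port 7001, with OpenAPI docs generated automatically) can be sketched as follows; the app title and placeholder route body are illustrative:

import uvicorn
from fastapi import FastAPI

app = FastAPI(title="AutoGGUF local API")


@app.get("/v1/models")
def models() -> dict:
    return {"models": []}  # placeholder for the real model listing


if __name__ == "__main__":
    # Bind to localhost only and default to port 7001, as the commit notes.
    uvicorn.run(app, host="127.0.0.1", port=7001)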
BuildTools db1733b4ed
refactor: allow specifying output path in lora conversion subprocess
- remove the shutil.move operation
- allow specifying the output path in arguments in convert_lora_to_ggml.py utility
- bump max number of LoRA layers
2024-08-31 14:54:08 -07:00
leafspark 5f354e692a
docs: update SECURITY.md 2024-08-31 14:05:14 -07:00
BuildTools a59b49fd97
chore: update llama.cpp convert scripts 2024-08-31 14:01:41 -07:00
BuildTools fb9addb8c0
feat(models): add HF upload/download class 2024-08-31 13:54:24 -07:00
BuildTools 5d6b6fb67d
Merge branch 'main' of https://github.com/leafspark/AutoGGUF 2024-08-29 15:01:38 -07:00
BuildTools 6583412b76
refactor: move functions out of AutoGGUF.py
- relocate functions into utils.py and TaskListItem.py
2024-08-29 15:01:13 -07:00
leafspark 33dc02c5ad
docs: update readme to v1.8.0 2024-08-29 14:46:14 -07:00
leafspark 7349aedc78
ci: fix slashes in build.yml 2024-08-26 15:04:04 -07:00
leafspark 7886c38bfb
ci: make sure to include convert_hf_to_gguf.py in output 2024-08-26 15:03:19 -07:00
BuildTools e307a4d3b5
chore: bump version to v1.8.0 2024-08-26 14:53:16 -07:00
BuildTools 624949b56d
Merge branch 'main' of https://github.com/leafspark/AutoGGUF 2024-08-25 15:11:51 -07:00
BuildTools fec22adc58
feat(config): add configuration options
- add more configuration options for AUTOGGUF_MODEL_DIR_NAME, AUTOGGUF_OUTPUT_DIR_NAME, and AUTOGGUF_RESIZE_FACTOR (these get created on startup)
- move some UI helper functions out of AutoGGUF.py and into ui_update and Ta
- optimize imports for utility classes
- fix some missing imports
2024-08-25 15:11:36 -07:00
leafspark ce8a545f07
Merge pull request #15 from leafspark/dependabot/pip/numpy-lt-3.0.0
build(deps): update numpy requirement from <2.0.0 to <3.0.0
2024-08-25 14:53:05 -07:00
dependabot[bot] d4b99b621a
build(deps): update numpy requirement from <2.0.0 to <3.0.0
Updates the requirements on [numpy](https://github.com/numpy/numpy) to permit the latest version.
- [Release notes](https://github.com/numpy/numpy/releases)
- [Changelog](https://github.com/numpy/numpy/blob/main/doc/RELEASE_WALKTHROUGH.rst)
- [Commits](https://github.com/numpy/numpy/compare/v0.2.0...v2.1.0)

---
updated-dependencies:
- dependency-name: numpy
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-08-25 11:13:25 +00:00
BuildTools 86d7bbba2d
style: reformat TaskListItem.py 2024-08-22 21:59:22 -07:00
BuildTools ca9f9a4a76
fix: import missing modules 2024-08-22 21:59:01 -07:00
BuildTools 6e424462ab
refactor: add type hints 2024-08-22 21:56:37 -07:00
BuildTools d4be39a22c
feat(core): implement plugins
- add plugins feature using importlib
- edit .gitignore
- change enabled state of AUTOGGUF_SERVER to "enabled" from "true" for consistency
2024-08-22 20:08:02 -07:00
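A rough sketch of an importlib-based plugin loader like the one described above, assuming a flat plugins/ directory of .py files (the directory layout and function name are assumptions):

import importlib.util
from pathlib import Path


def load_plugins(plugin_dir: str = "plugins") -> dict:
    plugins = {}
    for path in Path(plugin_dir).glob("*.py"):
        spec = importlib.util.spec_from_file_location(path.stem, path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)  # run the plugin's top-level code
        plugins[path.stem] = module
    return plugins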
BuildTools 53ab6a688e
fix: only count valid backends
- use valid_backends variable instead of backend_combo length, as the "No backends found" message is incorrectly counted as a single backend
2024-08-22 18:50:30 -07:00
BuildTools e8026892e9
chore: fix Arabic and French localizations 2024-08-22 17:52:28 -07:00
BuildTools c68eee75c3
ci: separate macOS and Linux runs 2024-08-22 17:39:07 -07:00
BuildTools bfd12bbfc6
ci: use proper sha256 file format and avoid overwriting 2024-08-22 17:34:32 -07:00
BuildTools bb6f0efa63
ci: add sha256 generation support to build.yml 2024-08-22 17:29:18 -07:00
BuildTools a97a545a28
refactor: move get helper functions to utils.py
- move get_models_data and get_tasks_data to utils.py from AutoGGUF.py
2024-08-22 17:08:45 -07:00
BuildTools 4f2c8057e1
feat(core): add verification to manual import + concatenated file support
- verify GGUFs on manual import
- show warning when dealing with concatenated files such as mradermacher's split GGUFs (partXofX)
2024-08-22 15:57:21 -07:00
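GGUF verification on import can be sketched as a magic-byte check, plus a filename test for partXofX-style concatenated uploads; the exact checks in the repository may differ:

import re
from pathlib import Path


def verify_gguf(path: str) -> bool:
    # Every valid GGUF file begins with the 4-byte magic b"GGUF".
    with open(path, "rb") as f:
        return f.read(4) == b"GGUF"


def looks_concatenated(path: str) -> bool:
    # Split uploads typically carry a partXofY marker in the filename.
    return re.search(r"part\d+of\d+", Path(path).name) is not None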
BuildTools 88875e3d67
feat(core): add manual model import
- allow importing models from any directory on the system
- add args as AutoGGUF class parameter
2024-08-22 15:39:08 -07:00
BuildTools 89d3762317
Merge branch 'main' of https://github.com/leafspark/AutoGGUF 2024-08-20 17:16:00 -07:00
BuildTools 32f29b9f7c
fix: update regex for progress tracking
- update regex for progress tracking
- add .env.example file
- update .gitignore
2024-08-20 17:15:55 -07:00
leafspark bc5d60eb15
docs: update changelog to v1.7.2 2024-08-19 17:23:03 -07:00
leafspark 3b96deb9c1
Merge pull request #13 from leafspark/dependabot/pip/torch-approx-eq-2.4.0
build(deps): update torch requirement from ~=2.2.0 to ~=2.4.0
2024-08-18 13:32:08 -07:00
BuildTools c8babbd6e1
feat(core): add update checking support
- add update checking support with AUTOGGUF_CHECK_UPDATE (enabled) env variable
- fix setup.py
- add localization keys for update checking
- update version to v1.7.2 in Localizations.py
- formatting changes
2024-08-18 13:30:45 -07:00
dependabot[bot] 492da8714d
build(deps): update torch requirement from ~=2.2.0 to ~=2.4.0
Updates the requirements on [torch](https://github.com/pytorch/pytorch) to permit the latest version.
- [Release notes](https://github.com/pytorch/pytorch/releases)
- [Changelog](https://github.com/pytorch/pytorch/blob/main/RELEASE.md)
- [Commits](https://github.com/pytorch/pytorch/compare/v2.2.0...v2.4.0)

---
updated-dependencies:
- dependency-name: torch
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-08-18 11:25:50 +00:00
BuildTools 7966150d90
ci: add PyPI setup script 2024-08-17 13:41:21 -07:00
BuildTools a1650bba07
chore: add missing requirements
- add missing requirements
- add dotenv file loading
2024-08-17 13:27:18 -07:00
BuildTools 03c8a23e07
feat(ui): use Unicode for X button
- use Unicode for X button in KV Overrides box
- move CustomTitleBar to separate file
2024-08-17 13:24:37 -07:00
BuildTools b62e8fc47c
feat(monitor): add smoother usage bar changes 2024-08-17 13:07:48 -07:00
BuildTools adc8f4bd02
refactor: move functions out of AutoGGUF.py
- move functions out of AutoGGUF.py
- add live update support to GPU monitor graphs
2024-08-17 12:47:01 -07:00
leafspark ac41372eb3
docs: update showcase image 2024-08-16 21:32:06 -07:00
leafspark 8f963f9143
docs: update readme to v1.7.1 2024-08-16 21:27:57 -07:00
BuildTools ea2b365886
docs: update changelog to v1.7.1 2024-08-16 21:07:05 -07:00
BuildTools f02fcb28de
ci: add inno setup build file
- add inno setup build file
- align localizations definitions
- update .gitignore
2024-08-16 21:04:32 -07:00
BuildTools 52c789e374
chore: update localizations to v1.7.1 2024-08-16 20:26:09 -07:00
BuildTools 77790613db
fix: add missing imports 2024-08-16 20:17:43 -07:00
BuildTools 656d468076
chore: add default.css theme 2024-08-16 20:06:47 -07:00
leafspark 95ca743e10
chore: add .css to .gitignore 2024-08-16 20:06:17 -07:00
BuildTools a7e8bf673e
refactor: adapt gguf library to project
- remove comments
- remove argparse help text
2024-08-16 19:58:29 -07:00
BuildTools f7f9a457ea
feat(models): add support for EXAONE model type
- updated llama.cpp convert_hf_to_gguf.py file to support EXAONE models by LG
2024-08-16 19:52:36 -07:00
BuildTools c02df3005e
docs: update docstrings and small code fixes
- update docstrings for AutoGGUF.py and add for lora_conversion.py and Logger.py
- fix IDE detected code typos and errors
2024-08-16 19:43:48 -07:00
leafspark 7ac297a3bc
Merge pull request #12 from leafspark/modern-ui
feat(ui): add modern UI
2024-08-16 19:24:35 -07:00
BuildTools 4cf03281ed
feat(ui): add window resizing shortcuts
- add Ctrl+ (increase by 10%), Ctrl- (reduce by 10%), Ctrl+0 (reset)
2024-08-16 19:17:33 -07:00
BuildTools 3a434865bc
feat(ui): add theming support and CPU usage bar
- add theming support
- add CPU usage bar
- add Save Preset and Load Preset menu bar options under File tab
2024-08-16 18:59:37 -07:00
BuildTools 66e7e3977f
refactor: make window scrollable
- make window scrollable
- move save/load preset logic to presets.py
- add Alt+F4 keybind to Close action
2024-08-16 18:45:22 -07:00
BuildTools 8249743119
fix: update version to v1.7.0 2024-08-16 17:31:07 -07:00
BuildTools 97378b0009
style: reformat file 2024-08-16 17:29:48 -07:00
BuildTools 9c2346baec
feat(config): allow setting window size through env
- allow setting window size through env
- update version to v1.7.0
2024-08-16 17:29:23 -07:00
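Window size from the environment presumably comes in as a WIDTHxHEIGHT string such as AUTOGGUF_RESOLUTION=1650x1100 in .env.example; a small illustrative parser (the default and function name are assumed):

import os


def window_size(default: tuple[int, int] = (1650, 1100)) -> tuple[int, int]:
    raw = os.environ.get("AUTOGGUF_RESOLUTION", "")
    try:
        width, height = (int(part) for part in raw.lower().split("x"))
        return width, height
    except ValueError:
        return default  # unset or malformed value: keep the default size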
BuildTools 432306d2ba
feat(ui): add seamless title bar 2024-08-16 17:16:15 -07:00
BuildTools cae2fb9ce3
Merge branch 'main' of https://github.com/leafspark/AutoGGUF 2024-08-16 16:28:39 -07:00
BuildTools 27a53a5b2b
fix: revert imports change
- revert absolute imports change
2024-08-16 16:28:24 -07:00
leafspark 2152069db9
docs: update changelog to v1.7.0 2024-08-16 16:20:35 -07:00
BuildTools d2c83efebb
refactor: change filenames to follow PEP 8 2024-08-16 16:08:08 -07:00
BuildTools 516aec0a13
docs: update CONTRIBUTING.md
- add PR instructions to CONTRIBUTING.md
- almost disable Radon
2024-08-16 15:57:38 -07:00
BuildTools 9d939151de
refactor: move functions to helper modules
- move functions to helper modules
- set radon to lenient (E)
- disable pre-commit workflow
- adjust module importing
2024-08-16 15:50:39 -07:00
BuildTools 000ca6bb1f
ci: support 32-bit builds
- support 32-bit builds
- fix pre-commit formatting issues
2024-08-16 15:22:48 -07:00
BuildTools c5e1313e9c
feat(ui): add menubar
- add basic menu bar showing Close and About areas
- add program version in localizations.py
- refactor functions out of AutoGGUF.py and move to ui_update.py
2024-08-16 15:15:29 -07:00
BuildTools f5e0bca12a
Merge branch 'main' of https://github.com/leafspark/AutoGGUF 2024-08-15 19:08:35 -07:00
BuildTools 25b4a76ec9
refactor: fix pre commit issues and update radon workflow
- run pre commits
- exclude quantize_model from radon
2024-08-15 19:08:21 -07:00
leafspark fed6f74789
Merge pull request #10 from leafspark/dependabot/pip/psutil-approx-eq-6.0.0
build(deps): update psutil requirement from ~=5.9.8 to ~=6.0.0
2024-08-15 19:06:58 -07:00
leafspark 4324ea7ce1
Merge pull request #9 from leafspark/dependabot/pip/torch-approx-eq-2.4.0
build(deps): update torch requirement from ~=1.13.1 to ~=2.4.0
2024-08-15 19:06:38 -07:00
dependabot[bot] 9751b04fed
build(deps): update psutil requirement from ~=5.9.8 to ~=6.0.0
Updates the requirements on [psutil](https://github.com/giampaolo/psutil) to permit the latest version.
- [Changelog](https://github.com/giampaolo/psutil/blob/master/HISTORY.rst)
- [Commits](https://github.com/giampaolo/psutil/compare/release-5.9.8...release-6.0.0)

---
updated-dependencies:
- dependency-name: psutil
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-08-16 02:03:50 +00:00
dependabot[bot] acd9710545
build(deps): update torch requirement from ~=1.13.1 to ~=2.4.0
Updates the requirements on [torch](https://github.com/pytorch/pytorch) to permit the latest version.
- [Release notes](https://github.com/pytorch/pytorch/releases)
- [Changelog](https://github.com/pytorch/pytorch/blob/main/RELEASE.md)
- [Commits](https://github.com/pytorch/pytorch/compare/v1.13.1...v2.4.0)

---
updated-dependencies:
- dependency-name: torch
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-08-16 02:03:47 +00:00
BuildTools 3946b2a49e
Merge branch 'main' of https://github.com/leafspark/AutoGGUF 2024-08-15 19:02:42 -07:00
BuildTools 034bbbb3b9
ci: add radon, dependabot, and pre-commit workflows
- add radon, dependabot, and pre-commit workflows
- optimize imports in AutoGGUF.py
2024-08-15 19:02:36 -07:00
leafspark 9ef6b1a664
ci: add dependency audit 2024-08-15 18:52:29 -07:00
leafspark b3eecc173d
ci: rename workflow and add src files to artifact 2024-08-15 18:43:13 -07:00
leafspark 82cd5eb129
ci: exclude torch from build 2024-08-15 18:36:26 -07:00
leafspark 987c32d887
ci: fix build workflow
- fixed sha hash syntax error
2024-08-15 18:33:41 -07:00
leafspark 048bececb8
ci: add build workflow 2024-08-15 18:31:43 -07:00
leafspark ea4f0a0c41
ci: disable .md and .txt CodeQL analysis 2024-08-15 18:23:42 -07:00
leafspark e867a9e5cf
docs: update SECURITY.md 2024-08-15 18:22:30 -07:00
leafspark e1aafb4681
docs: fix image blur 2024-08-15 18:13:04 -07:00
leafspark bcf0a4045d
docs: update readme to v1.6.2 and style screenshot 2024-08-15 18:11:26 -07:00
leafspark f19e1c5ab4
docs: update changelog to v1.6.2 2024-08-15 17:56:03 -07:00
BuildTools f3257bfa69
feat: support env variables as server config
- add AUTOGGUF_SERVER=true/false and AUTOGGUF_SERVER_PORT=int configuration options
- update AutoGGUF docstrings
2024-08-15 17:25:33 -07:00
BuildTools e77fa3ee4d
Merge branch 'main' of https://github.com/leafspark/AutoGGUF 2024-08-15 17:15:28 -07:00
BuildTools 2e90c91eb8
feat(server): add read only flask server
- added the following endpoints:
/v1/backends (lists all backends and path)
/v1/health (heartbeat)
/v1/tasks (gets current task info, includes name, status, progress, and log file)
/v1/models (gets name, model type, path, and shard status)
2024-08-15 17:15:21 -07:00
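A minimal sketch of those read-only endpoints in Flask (the framework used at this point in the history, later replaced by FastAPI); the returned payloads are placeholders for the live backend, task, and model state:

from flask import Flask, jsonify

app = Flask(__name__)


@app.route("/v1/health")
def health():
    return jsonify({"status": "alive"})  # heartbeat


@app.route("/v1/backends")
def backends():
    return jsonify({"backends": []})  # name and path per backend


@app.route("/v1/tasks")
def tasks():
    return jsonify({"tasks": []})  # name, status, progress, log file


@app.route("/v1/models")
def models():
    return jsonify({"models": []})  # name, model type, path, shard status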
BuildTools 6ec85aa84f
refactor: edit build scripts and add README mention 2024-08-13 20:10:06 -07:00
BuildTools 79eeb02694
fix: set GGML types to lowercase in command builder 2024-08-13 20:07:50 -07:00
BuildTools a8ed4a87b1
ci: add optimized build scripts using Nuitka 2024-08-12 20:59:01 -07:00
BuildTools ac725c678b
Merge branch 'main' of https://github.com/leafspark/AutoGGUF 2024-08-11 18:45:47 -07:00
BuildTools f089aa264d
fix: cancel first before deleting task 2024-08-11 18:45:40 -07:00
leafspark 67302124f0
docs: update changelog to v1.6.0 2024-08-08 13:43:52 -07:00
leafspark 0ef2c91d6b
docs: update to v1.6.0 features 2024-08-08 13:41:23 -07:00
BuildTools 7e61f6b3d3
refactor: Switch from PyQt6 to PySide6 for license compatibility
- Replaced all PyQt6 imports with PySide6
- Updated signal syntax (pyqtSignal to Signal)
- Modified requirements.txt to use PySide6
- Ensured compatibility with Apache-2.0 license
2024-08-08 13:04:30 -07:00
BuildTools 260d4cd011
Merge branch 'main' of https://github.com/leafspark/AutoGGUF 2024-08-08 12:51:48 -07:00
BuildTools b2d4fd06a4
feat: add GPU monitoring for NVIDIA GPUs
- add GPU monitoring for NVIDIA GPUs
2024-08-08 12:51:41 -07:00
leafspark b3b3c592f1
docs: update SECURITY.md 2024-08-08 10:46:24 -07:00
leafspark c659790942
docs: update build instructions and dependencies 2024-08-08 10:45:48 -07:00
BuildTools f5b3b43d01
fix: load preset error and LoRA loading
- support loading *.gguf file types
- update FAILED_TO_LOAD_PRESET localization key
- remove Save Preset context menu action
2024-08-08 10:32:12 -07:00
BuildTools 11e35a7535
ci: reduce unnecessary checks
- only run on .py file modification
- add PR trigger to pylint.yml
- increase strictness of pylint.yml
2024-08-07 16:59:58 -07:00
leafspark e4d810b262
build: update project dependencies 2024-08-07 16:54:01 -07:00
leafspark b8584ac828
docs: update shields and featurelist 2024-08-07 16:36:26 -07:00
leafspark 54c3b26612
docs: update changelog to v1.5.0 2024-08-07 16:07:18 -07:00
BuildTools cb51a22d2f
refactor: Implement QScrollArea for universal scrolling 2024-08-06 16:08:32 -07:00
BuildTools 6070aac200
refactor: use localization keys for HF conversion
- reduce base class length
- reorganize English, French, and Simplified Chinese
2024-08-05 19:21:30 -07:00
BuildTools c0635936cc
refactor: allow more tensor types 2024-08-05 17:51:54 -07:00
BuildTools eca2ecc785
refactor: move some strings to localizations 2024-08-05 17:41:12 -07:00
BuildTools ab7ffb0ad3
refactor: optimize imports 2024-08-05 17:29:01 -07:00
BuildTools 1feab011e4
refactor: move error handling to separate module 2024-08-05 16:52:33 -07:00
leafspark 3ff9caabbf
docs: update verification instructions and update to v1.5.0 prerel 2024-08-05 14:06:47 -07:00
BuildTools 4ced884a0d
feat(conversion): add HF to GGUF conversion + refactor localization
- refactor localization and split into sections
- add HF safetensors conversion support
- add run.sh script for Linux
2024-08-05 13:29:30 -07:00
leafspark d25989dc0c
docs: update changelog to v1.4.3 2024-08-05 12:24:33 -07:00
leafspark 51c3de4995
docs: update to v1.4.3 2024-08-05 12:22:02 -07:00
BuildTools aaacba491d
Merge branch 'main' of https://github.com/leafspark/AutoGGUF 2024-08-05 12:11:44 -07:00
BuildTools dc9a1c8971
feat: support multiple quantization types
This commit adds the ability to select and run multiple quantization types simultaneously. It includes:
- Replacing the quantization type dropdown with a multi-select list
- Updating preset saving and loading to handle multiple quantization types
- Modifying the quantize_model function to process all selected types
- fix formatting issue with previous commit
- use error and in progress messages from localizations in QuantizationThread
2024-08-05 12:11:08 -07:00
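The multi-type behavior described above boils down to looping over the selected quantization types; a hedged sketch with hypothetical helper names (run_quantization stands in for the project's own quantize path):

def quantize_all(model_path: str, selected_types: list[str]) -> None:
    for quant_type in selected_types:
        out_path = f"{model_path.rsplit('.', 1)[0]}-{quant_type}.gguf"
        run_quantization(model_path, out_path, quant_type)


def run_quantization(src: str, dst: str, quant_type: str) -> None:
    # Placeholder: the real path shells out to llama.cpp's quantize binary
    # and reports progress and errors through the task list.
    print(f"quantizing {src} -> {dst} ({quant_type})")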
leafspark fc78dd365f
docs: Add SECURITY.md 2024-08-05 11:53:49 -07:00
leafspark eccd60fa5e
docs: add code of conduct 2024-08-05 11:47:18 -07:00
BuildTools e69237b7b9
docs: enhance class and method documentation
- fix formatting issues from last commit
- add docstrings for important classes
2024-08-05 11:37:20 -07:00
BuildTools c9167a7ac4
feat(model): implement model sharding management
- Add sharding functionality to improve performance
- Update documentation to make reading code easier
- Modify .gitignore for stricter file management
2024-08-05 11:26:29 -07:00
leafspark cfa61fd9e7
ci: remove crlf 2024-08-04 21:15:34 -07:00
BuildTools 444475fb3f style: reformat localizations file 2024-08-04 21:11:52 -07:00
leafspark 33275f13a2
ci: add formatting check 2024-08-04 21:04:21 -07:00
leafspark 247f8c1c45
docs: update pre commit instructions 2024-08-04 21:03:39 -07:00
BuildTools 868b15b61f ci: add pre commit config and report template 2024-08-04 20:59:02 -07:00
BuildTools e09f54dcb7 Merge branch 'main' of https://github.com/leafspark/AutoGGUF 2024-08-04 19:50:48 -07:00
BuildTools fa51f7cdb8 style: format code with Black 2024-08-04 19:50:34 -07:00
leafspark a69f6d6750
docs: update formatting instructions 2024-08-04 19:47:29 -07:00
leafspark eb8096699d
docs: update readme to v1.4.2
added and organized shields, clear building instructions and bugfix reflected in Issues section
2024-08-04 19:45:01 -07:00
leafspark 08bff51ab3
docs: update changelog to v1.4.2 2024-08-04 19:37:19 -07:00
78 changed files with 24484 additions and 10510 deletions

.env.example (new file, 13 lines)
@@ -0,0 +1,13 @@
AUTOGGUF_RESOLUTION=1650x1100
AUTOGGUF_THEME=
AUTOGGUF_CHECK_BACKEND=disabled
AUTOGGUF_CHECK_UPDATE=disabled
AUTOGGUF_SERVER_API_KEY=
AUTOGGUF_MODEL_DIR_NAME=models
AUTOGGUF_OUTPUT_DIR_NAME=quantized_models
AUTOGGUF_RESIZE_FACTOR=1.1
AUTOGGUF_SERVER=enabled
AUTOGGUF_SERVER_PORT=7001
AUTOGGUF_SERVER_API_KEY=
AUTOGGUF_LANGUAGE=en-US
AUTOGGUF_BACKEND_REPO=ggerganov/llama.cpp
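The commit history above mentions a minimal .env loader; a sketch that handles the KEY=VALUE format shown in this file (illustrative only, the repository has its own loader):

import os


def load_dotenv(path: str = ".env") -> None:
    try:
        with open(path, encoding="utf-8") as env_file:
            for line in env_file:
                line = line.strip()
                if not line or line.startswith("#") or "=" not in line:
                    continue
                key, _, value = line.partition("=")
                os.environ.setdefault(key.strip(), value.strip())
    except FileNotFoundError:
        pass  # no .env present; rely on the real environment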

.github/ISSUE_TEMPLATE/bug_report.md (new vendored file, 38 lines)
@@ -0,0 +1,38 @@
---
name: Bug report
about: Create a report to help us improve AutoGGUF
title: '[BUG] '
labels: bug
assignees: ''
---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error

**Expected behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**Environment (please complete the following information):**
- OS: [e.g. Windows, macOS, Linux]
- AutoGGUF Version: [e.g. v1.4.2]
- Python Version (if running from source): [e.g. 3.9]
- llama.cpp backend version: [e.g. 3601]

**Additional context**
Add any other context about the problem here. Include any relevant log outputs or error messages.

**Checklist:**
- [ ] I have checked the existing issues to make sure this is not a duplicate
- [ ] I have included all relevant information to reproduce the issue
- [ ] I am running the latest version of AutoGGUF

8
.github/dependabot.yml vendored Normal file

@ -0,0 +1,8 @@
version: 2
updates:
- package-ecosystem: "pip"
directory: "/"
schedule:
interval: "weekly"
day: "sunday"
open-pull-requests-limit: 10

20
.github/workflows/black.yml vendored Normal file

@ -0,0 +1,20 @@
name: Black
on:
push:
paths:
- '**.py'
pull_request:
paths:
- '**.py'
jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
- uses: psf/black@stable
with:
options: "--check --verbose"
src: "./src"

135
.github/workflows/build.yml vendored Normal file

@ -0,0 +1,135 @@
name: Build AutoGGUF (PyInstaller)
on:
workflow_dispatch:
inputs:
build_type:
description: 'Build type (RELEASE or DEV)'
required: true
default: 'RELEASE'
type: choice
options:
- RELEASE
- DEV
jobs:
build:
strategy:
matrix:
os: [windows-latest, ubuntu-latest, macos-latest]
arch: [x64]
runs-on: ${{ matrix.os }}
outputs:
artifact-names: ${{ steps.set-outputs.outputs.artifact-names }}
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
architecture: ${{ matrix.arch }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install $(grep -v "^torch" requirements.txt)
pip install pyinstaller pillow
- name: Build with PyInstaller (Windows)
if: matrix.os == 'windows-latest'
run: |
$archSuffix = if ("${{ matrix.arch }}" -eq "x86") { "-x86" } else { "-x64" }
if ("${{ github.event.inputs.build_type }}" -eq "RELEASE") {
pyinstaller --windowed --onefile --name=AutoGGUF$archSuffix --icon=../../assets/favicon_large.png --add-data "../../assets;assets" --distpath=build\release\dist --workpath=build\release\build --specpath=build\release src\main.py
} else {
pyinstaller --onefile --name=AutoGGUF$archSuffix --icon=../../assets/favicon_large.png --add-data "../../assets;assets" --distpath=build\dev\dist --workpath=build\dev\build --specpath=build\dev src\main.py
}
- name: Build with PyInstaller (Linux/macOS)
if: matrix.os != 'windows-latest'
run: |
if [ "${{ github.event.inputs.build_type }}" = "RELEASE" ]; then
pyinstaller --windowed --onefile --name=AutoGGUF-x64 --icon=../../assets/favicon_large.png --add-data "../../assets:assets" --distpath=build/release/dist --workpath=build/release/build --specpath=build/release src/main.py
else
pyinstaller --onefile --name=AutoGGUF-x64 --icon=../../assets/favicon_large.png --add-data "../../assets:assets" --distpath=build/dev/dist --workpath=build/dev/build --specpath=build/dev src/main.py
fi
- name: Copy additional files (Windows)
if: matrix.os == 'windows-latest'
run: |
$distPath = if ("${{ github.event.inputs.build_type }}" -eq "RELEASE") { "build\release\dist" } else { "build\dev\dist" }
New-Item -ItemType Directory -Force -Path "$distPath\src\gguf"
Copy-Item -Path "src\gguf\*" -Destination "$distPath\src\gguf" -Recurse
Copy-Item -Path "src\convert_hf_to_gguf.py" -Destination "$distPath\src"
Copy-Item -Path "src\convert_lora_to_gguf.py" -Destination "$distPath\src"
Copy-Item -Path "src\convert_lora_to_ggml.py" -Destination "$distPath\src"
Copy-Item -Path "src\quantize_to_fp8_dynamic.py" -Destination "$distPath\src"
Copy-Item -Path ".env.example" -Destination "$distPath\"
- name: Copy additional files (Linux/macOS)
if: matrix.os != 'windows-latest'
run: |
distPath=$(if [ "${{ github.event.inputs.build_type }}" = "RELEASE" ]; then echo "build/release/dist"; else echo "build/dev/dist"; fi)
mkdir -p $distPath/src/gguf
cp -R src/gguf/* $distPath/src/gguf/
cp src/convert_hf_to_gguf.py $distPath/src/
cp src/convert_lora_to_gguf.py $distPath/src/
cp src/convert_lora_to_ggml.py $distPath/src/
cp src/quantize_to_fp8_dynamic.py $distPath/src/
cp .env.example $distPath/
- name: Set outputs for artifact name
id: set-outputs
run: echo "artifact-name=AutoGGUF-${{ matrix.os }}-${{ matrix.arch }}-${{ github.event.inputs.build_type }}-${{ github.sha }}" >> $GITHUB_OUTPUT
- name: Upload Artifact
uses: actions/upload-artifact@v4
with:
name: AutoGGUF-${{ matrix.os }}-${{ matrix.arch }}-${{ github.event.inputs.build_type }}-${{ github.sha }}
path: build/${{ github.event.inputs.build_type == 'RELEASE' && 'release' || 'dev' }}/dist
generate-checksums:
needs: build
runs-on: ubuntu-latest
steps:
- name: Download all artifacts
uses: actions/download-artifact@v4
with:
path: ./artifacts
- name: Generate SHA256 checksums for all artifacts
run: |
cd artifacts
versionHash=$(echo ${{ github.sha }} | cut -c1-7)
echo "# AutoGGUF Build Checksums" > ../checksums.txt
echo "Build: ${{ github.event.inputs.build_type }}" >> ../checksums.txt
echo "Commit: ${{ github.sha }}" >> ../checksums.txt
echo "Date: $(date -u)" >> ../checksums.txt
echo "" >> ../checksums.txt
# Find all artifact directories and generate checksums of their zip equivalents
for artifact_dir in AutoGGUF-*-${{ github.event.inputs.build_type }}-${{ github.sha }}; do
if [ -d "$artifact_dir" ]; then
echo "Processing $artifact_dir..."
cd "$artifact_dir"
# Create a temporary zip to calculate hash (simulating what GitHub creates)
zip -r "../temp_${artifact_dir}.zip" .
cd ..
# Generate SHA256 of the zip file
hash=$(sha256sum "temp_${artifact_dir}.zip" | cut -d' ' -f1)
echo "${hash} ${artifact_dir}.zip" >> ../checksums.txt
# Clean up the temporary zip
rm "temp_${artifact_dir}.zip"
fi
done
- name: Upload checksums
uses: actions/upload-artifact@v4
with:
name: AutoGGUF-${{ github.sha }}-SHA256
path: checksums.txt

View File

@ -14,8 +14,14 @@ name: "CodeQL"
on:
push:
branches: [ "main" ]
paths-ignore:
- '**/*.md'
- '**/*.txt'
pull_request:
branches: [ "main" ]
paths-ignore:
- '**/*.md'
- '**/*.txt'
schedule:
- cron: '21 20 * * 6'
@ -71,7 +77,7 @@ jobs:
# For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
# queries: security-extended,security-and-quality
# If the analyze step fails for one of the languages you are analyzing with
# If the analysis step fails for one of the languages you are analyzing with
# "We were unable to automatically build your code", modify the matrix above
# to set the build mode to "manual" for that language. Then modify this step
# to build your code.

59
.github/workflows/pip-audit.yml vendored Normal file

@ -0,0 +1,59 @@
name: Dependency Audit
on:
push:
paths:
- '**/requirements.txt'
pull_request:
paths:
- '**/requirements.txt'
schedule:
- cron: '0 0 * * *' # Run daily at midnight UTC
jobs:
audit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.x'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pip-audit
- name: Run pip-audit
run: |
pip-audit -r requirements.txt > audit_output.txt
continue-on-error: true
- name: Display audit results
run: cat audit_output.txt
- name: Create detailed report
run: |
echo "Pip Audit Report" > detailed_report.txt
echo "==================" >> detailed_report.txt
echo "" >> detailed_report.txt
echo "Date: $(date)" >> detailed_report.txt
echo "" >> detailed_report.txt
echo "Audit Results:" >> detailed_report.txt
cat audit_output.txt >> detailed_report.txt
echo "" >> detailed_report.txt
echo "Environment:" >> detailed_report.txt
python --version >> detailed_report.txt
pip --version >> detailed_report.txt
echo "" >> detailed_report.txt
echo "Requirements:" >> detailed_report.txt
cat requirements.txt >> detailed_report.txt
- name: Upload audit results
uses: actions/upload-artifact@v4
with:
name: pip-audit-report
path: detailed_report.txt

View File

@ -0,0 +1,17 @@
name: pre-commit
on: [push, pull_request]
jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.x'
- name: Install pre-commit
run: pip install pre-commit
- name: Run pre-commit
run: pre-commit run --all-files

View File

@ -1,23 +1,28 @@
name: Pylint
on: [push]
on:
push:
paths:
- '**.py'
pull_request:
paths:
- '**.py'
jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10"]
python-version: ["3.9", "3.10"]
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pylint PyQt6 psutil requests
pip install $(grep -v "^torch" requirements.txt | tr '\n' ' ')
pip install pylint
- name: Analysing the code with pylint
run: |
pylint $(git ls-files '*.py') --disable=all --enable=E0001,E0100,E0101,E0102,E0103,E0104,E0105,E0107,E0108,E0110,E0111,E0112,E0113,E0114,E0115,E0116,E0117,E0118,E0202,E0203,E0211,E0213,E0236,E0237,E0238,E0239,E0240,E0241,E0301,E0302,E0303,E0401,E0402,E0601,E0602,E0603,E0604,E0701,E0702,E0703,E0704,E0710,E0711,E0712,E1003,E1101,E1102,E1111,E1120,E1121,E1123,E1124,E1125,E1126,E1127,E1128,E1129,E1130,E1131,E1132,E1133,E1134,E1135,E1136,E1137,E1138,E1139,E1200,E1201,E1205,E1206,E1300,E1301,E1302,E1303,E1304,E1305,E1306,E1310,E1700,E1701 --fail-under=1
pylint $(git ls-files '*.py') --disable=all --enable=E0001,E0100,E0101,E0102,E0103,E0104,E0105,E0107,E0108,E0110,E0111,E0112,E0113,E0114,E0115,E0116,E0117,E0118,E0202,E0203,E0211,E0213,E0236,E0237,E0238,E0239,E0240,E0241,E0301,E0302,E0303,E0401,E0402,E0701,E0702,E0703,E0704,E0710,E0711,E0712,E1003,E1101,E1102,E1111,E1120,E1121,E1123,E1124,E1125,E1126,E1127,E1128,E1129,E1130,E1131,E1132,E1133,E1134,E1135,E1136,E1137,E1138,E1139,E1200,E1201,E1205,E1206,E1300,E1301,E1302,E1303,E1304,E1305,E1306,E1310,E1700,E1701,W0311,W0312,W0611,W0612,W0613,W0702,W1401,W1402,C0123,C0200,C0325,C0411,C0412 --fail-under=5

72
.github/workflows/radon.yml vendored Normal file

@ -0,0 +1,72 @@
name: Radon Code Metrics
on:
workflow_dispatch:
push:
paths:
- '**.py'
pull_request:
paths:
- '**.py'
jobs:
radon:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.x'
- name: Install radon
run: pip install radon
- name: Run radon
run: |
if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
CHANGED_FILES=$(git ls-files '*.py')
else
CHANGED_FILES=$(git diff --name-only ${{ github.event.before }} ${{ github.sha }} | grep '\.py$' || echo "")
fi
echo "Files to be analyzed:"
echo "$CHANGED_FILES"
if [ -n "$CHANGED_FILES" ]; then
echo "Running Cyclomatic Complexity check..."
radon cc $CHANGED_FILES -a -s -n F --exclude "AutoGGUF.quantize_model"
echo "Running Maintainability Index check..."
radon mi $CHANGED_FILES -s -n F
else
echo "No Python files to analyze."
fi
continue-on-error: true
- name: Check radon output
run: |
if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
CHANGED_FILES=$(git ls-files '*.py')
else
CHANGED_FILES=$(git diff --name-only ${{ github.event.before }} ${{ github.sha }} | grep '\.py$' || echo "")
fi
if [ -n "$CHANGED_FILES" ]; then
CC_OUTPUT=$(radon cc $CHANGED_FILES -a -s -n F --exclude "AutoGGUF.quantize_model")
MI_OUTPUT=$(radon mi $CHANGED_FILES -s -n F)
if [ -n "$CC_OUTPUT" ] || [ -n "$MI_OUTPUT" ]; then
echo "Radon detected code complexity or maintainability issues:"
[ -n "$CC_OUTPUT" ] && echo "$CC_OUTPUT"
[ -n "$MI_OUTPUT" ] && echo "$MI_OUTPUT"
exit 1
else
echo "No code complexity or maintainability issues detected."
fi
else
echo "No Python files to analyze."
fi

41
.gitignore vendored

@ -4,17 +4,50 @@ __pycache__/
# Ignore everything
*
# But allow these file types
!*.py
# Allow specific file types globally
!*.bat
!*.txt
!*.md
!*.sh
!LICENSE
# Allow assets folder
# Allow these files
!.pre-commit-config.yaml
!.env.example
!setup.py
# Allow src folder and its .py files
!src/
src/*
!src/*.py
!src/gguf
src/gguf/*
!src/gguf/*.py
# Allow docs folder and its .py files
!docs/
docs/*
!docs/*.py
# Allow plugins folder and its .py files
!plugins/
plugins/*
!plugins/*.py
# Allow assets folder, but only .svg, .png, .rc, .css, .iss and .ico files
!assets/
!assets/**
assets/*
!assets/*.svg
!assets/*.png
!assets/*.ico
!assets/*.rc
!assets/*.res
!assets/*.css
!assets/*.iss
# Allow .github folder and its contents
!.github/
!.github/**
# Don't ignore .gitignore
!.gitignore

10
.pre-commit-config.yaml Normal file

@ -0,0 +1,10 @@
repos:
- repo: https://github.com/psf/black
rev: 22.10.0
hooks:
- id: black
language_version: python3
- repo: https://github.com/Lucas-C/pre-commit-hooks
rev: v1.1.9
hooks:
- id: remove-crlf

View File

@ -1,14 +1,327 @@
# Changelog
All notable changes to this project will be documented in this file.
## [v2.0.1] - 2025-05-24
## [1.4.1] - 2024-08-04
### Added
- Human readable mappings from KV pairs into model properties
- certifi library for backend download and update checking
- Automated checksums in CI process
### Changed
- Updated llama.cpp backend
- Improved backend UI, logging, and task handling
- Enhanced display of model properties and cleaner formatting of KV pairs
- Updated tensor data formatting and removed redundant KV pairs property
- Updated CUDA backend check for latest llama.cpp release format
- Global urllib usage implementation
- Updated README with more information about patches and updates
- Edited quick start instructions
- Small file formatting improvements
### Fixed
- Type hints corrections
- Build errors in CI
- `@upload-artifact` updated to v4
## [v2.0.0] - 2025-01-27
### Added
- Clipboard support for save/load preset functionality with shift-click option
- Support for shift-clicking to get quantization command
- AUTOGGUF_BACKEND_REPO environment variable for custom GitHub repository fetching
- New HF to GGUF conversion types: `tq1_0` and `tq2_0`
### Changed
- Updated multiple dependencies:
- PySide6, PyTorch, Transformers, FastAPI, uvicorn, and other core libraries to their latest compatible versions
- Adjusted monitoring intervals from 0.2s to 0.5s
- Updated copyright year to 2025
- Bundled llama.cpp licensing text in About menu
- Removed x86 build matrix from CI
- Removed Import Model confirmation dialog
### Fixed
- Resolved PySide6 segfault issue
- Fixed error when deleting models from list
- Corrected incorrect menu bar name for Load Preset
## [v1.9.1] - 2024-10-13
### Added
- Support for specifying log directory name using AUTOGGUF_LOG_DIR_NAME environment variable
- Work in progress GGUF merge window
- Support for repository types in HF Transfer utility
- New `dequantize_gguf.py` script
- Support for MiniCPM3, RWKVv6, OLMoE, IBM Granite, and Jamba in llama.cpp convert scripts (conversion only)
- Add Nuitka build script for Linux
### Changed
- Updated Finnish and Russian localizations using Claude 3 Opus
- Improved layout of HF Upload window
- Updated gguf library from upstream
- Refactored code to use localizations for menubar
- Renamed imports_and_globals.py to globals.py
- Moved general functions verify_gguf and process_args to globals.py
- Created Plugins class for extensibility
- Updated dependencies:
- huggingface-hub
- fastapi (~=0.115.0)
- setuptools (~=75.1.0)
- pyside6 (~=6.7.3)
- uvicorn (~=0.31.0)
### Fixed
- Corrected localization strings and file select types for GGUF merging
- Fix minor errors in build scripts
## [v1.9.0] - 2024-09-15
### Added
- Implemented Hugging Face (HF) upload functionality with GUI definitions
- Added RAM and CPU usage graphs to UI
- Input validation using wraps added to UI
- Right-click context menu added to the models list in UI
- Support for iMatrix generation tracking
- GGUF splitting feature added
- Japanese and German localizations updated
### Changed
- Refactored to move functions out of `AutoGGUF` to reduce bloat
- Localized GGUF split strings
- Optimized GGUF imports and renamed related modules
- Removed old `HFTransfer` class
- Adjusted logging strings and updated French and Dutch localizations
- Improved startup time by optimizing default configuration, disabling network fetches for backends/updates
- Removed `requests` and `python-dotenv` to reduce size
- Updated `fastapi` requirement from `~=0.112.2` to `~=0.114.2`
- Updated `torch` requirement from `~=2.4.0` to `~=2.4.1`
- Updated `setuptools` requirement from `~=74.0.0` to `~=74.1.2`
- Updated `safetensors` requirement from `~=0.4.4` to `~=0.4.5`
- Updated `huggingface-hub` requirement from `~=0.24.6` to `~=0.24.7`
### Fixed
- Adjusted indeterminate progress bar behavior
- Removed comments in `requirements.txt` and updated its formatting
## [v1.8.1] - 2024-09-04
### Added
- AutoFP8 quantization classes and window (currently WIP)
- Minimize/maximize buttons to title bar
- API key authentication support for the local server
- HuggingFace upload/download class
- OpenAPI docs for endpoints
- Added new showcase image
### Changed
- Replaced Flask with FastAPI and Uvicorn for improved performance
- Moved functions out of AutoGGUF.py into utils.py and TaskListItem.py
- Updated llama.cpp convert scripts
- Improved LoRA conversion process:
- Allow specifying output path in arguments
- Removed shutil.move operation
- Increased max number of LoRA layers
- Changed default port to 7001
- Now binding to localhost (127.0.0.1) instead of 0.0.0.0
- Updated Spanish localizations
- Updated setuptools requirement from ~=68.2.0 to ~=74.0.0
- Updated .env.example with new configuration parameters
### Fixed
- Web page not found error
- Use of proper status in TaskListItem
- Passing of quant_threads and Logger to TaskListItem
- Improved window moving smoothness
- Prevention of moving window below taskbar
- Optimized imports in various files
- Remove aliased quant types
## [v1.8.0] - 2024-08-26
### Added
- .env.example file added
- Sha256 generation support added to build.yml
- Allow importing models from any directory on the system
- Added manual model import functionality
- Verification for manual imports and support for concatenated files
- Implemented plugins feature using importlib
- Configuration options for AUTOGGUF_MODEL_DIR_NAME, AUTOGGUF_OUTPUT_DIR_NAME, and AUTOGGUF_RESIZE_FACTOR added
### Changed
- Moved get helper functions to utils.py
- Added type hints
- Reformat TaskListItem.py for better readability
- Separate macOS and Linux runs in CI/CD
- Updated .gitignore for better file management
- Updated numpy requirement from <2.0.0 to <3.0.0
### Fixed
- Fixed sha256 file format and avoided overwriting
- Updated regex for progress tracking
- Arabic and French localizations fixed
- Only count valid backends instead of total backend combos
- Import missing modules
## [v1.7.2] - 2024-08-19
### Added
- Update checking support (controlled by AUTOGGUF_CHECK_UPDATE environment variable)
- Live update support for GPU monitor graphs
- Smoother usage bar changes in monitor
- Unicode X button in KV Overrides box
- PyPI setup script
- Inno Setup build file
- Missing requirements and dotenv file loading
### Changed
- Moved functions out of AutoGGUF.py
- Relocated CustomTitleBar to separate file
- Updated torch requirement from ~=2.2.0 to ~=2.4.0
- Updated showcase image
- Version bumped to v1.7.2 in Localizations.py
### Fixed
- setup.py issues
## [v1.7.1] - 2024-08-16
### Added
- Modern UI with seamless title bar
- Window resizing shortcuts (Ctrl+, Ctrl-, Ctrl+0)
- Theming support
- CPU usage bar
- Save Preset and Load Preset options in File menu
- Support for EXAONE model type
- Window size configuration through environment variables
### Changed
- Refactored window to be scrollable
- Moved save/load preset logic to presets.py
- Updated docstrings for AutoGGUF.py, lora_conversion.py, and Logger.py
- Adapted gguf library to project standards
### Fixed
- Updated version to v1.7.0
- Fixed IDE-detected code typos and errors
## [v1.7.0] - 2024-08-16
### Added
- Menu bar with Close and About options
- Program version in localizations.py
- Support for 32-bit builds
- Added dependency audit
- Implemented radon, dependabot, and pre-commit workflows
### Changed
- Updated torch requirement from `~=1.13.1` to `~=2.4.0`
- Updated psutil requirement from `~=5.9.8` to `~=6.0.0`
- Refactored functions out of AutoGGUF.py and moved to ui_update.py
- Changed filenames to follow PEP 8 conventions
- Disabled .md and .txt CodeQL analysis
### Fixed
- Optimized imports in AutoGGUF.py
- Updated README with new version and styled screenshot
- Fixed image blur in documentation
## [v1.6.2] - 2024-08-15
### Added
- Server functionality with new endpoints:
- `/v1/backends`: Lists all backends and their paths
- `/v1/health`: Heartbeat endpoint
- `/v1/tasks`: Provides current task info (name, status, progress, log file)
- `/v1/models`: Retrieves model details (name, type, path, shard status)
- Environment variable support for server configuration:
- `AUTOGGUF_SERVER`: Enable/disable server (true/false)
- `AUTOGGUF_SERVER_PORT`: Set server port (integer)
### Changed
- Updated AutoGGUF docstrings
- Refactored build scripts
### Fixed
- Set GGML types to lowercase in command builder
## [v1.6.1] - 2024-08-12
### Added
- Optimized build scripts
- Nuitka for building
### Changed
- Updated .gitignore
### Fixed
- Bug where deletion while a task is running crashes the program
### Notes
- Fast build: Higher unzipped size (97MB), smaller download (38MB)
- Standard build: Created with PyInstaller, medium download and unzipped size (50MB), potentially slower
## [v1.6.0] - 2024-08-08
### Changed
- Resolve licensing issues by using PySide6
### Added
- Add GPU monitoring support for NVIDIA GPUs
## [v1.5.1] - 2024-08-08
### Changed
- Refactor localizations to use them in HF conversion area
- Rename FAILED_LOAD_PRESET to FAILED_TO_LOAD_PRESET localization key
### Removed
- Remove Save Preset context menu action
### Added
- Support loading *.gguf file types
## [v1.5.0] - 2024-08-06
### Changed
- Refactor localizations to use them in HF conversion area
- Organize localizations
### Added
- Add sha256 and PGP signatures (same as commit ones)
- Add HuggingFace to GGUF conversion support
### Fixed
- Fix scaling on low resolution screens, interface now scrolls
## [v1.4.3] - 2024-08-05
### Changed
- Updated src file in release to be Black formatted
- Modifying the quantize_model function to process all selected types
- Updating preset saving and loading to handle multiple quantization types
- Use ERROR and IN_PROGRESS constants from localizations in QuantizationThread
- Minor repository changes
### Added
- Added model sharding management support
- Allow multiple quantization types to be selected and started simultaneously
## [v1.4.2] - 2024-08-04
### Fixed
- Resolves bug where Base Model text was shown even when GGML type was selected
- Improved alignment
### Changed
- Minor repository changes
## [v1.4.1] - 2024-08-04
### Added
- Dynamic KV Overrides (see wiki: AutoGGUF/wiki/Dynamic-KV-Overrides)
- Quantization commands are now printed and logged
## [1.4.0] - 2024-08-04
## [v1.4.0] - 2024-08-04
### Added
- LoRA Conversion:
@ -32,7 +345,7 @@ ### Added
- Currently includes src folder with conversion tools
- No console window popup
## [1.3.1] - 2024-08-04
## [v1.3.1] - 2024-08-04
### Added
- AUTOGGUF_CHECK_BACKEND environment variable to disable backend check on start
@ -40,7 +353,7 @@ ### Added
### Changed
- --onefile build with PyInstaller, _internal directory is no longer required
## [1.3.0] - 2024-08-03
## [v1.3.0] - 2024-08-03
### Added
- Support for new llama-imatrix parameters:
@ -62,7 +375,7 @@ ### Fixed
### Removed
- Duplicated functions
## [1.2.1] - 2024-08-03
## [v1.2.1] - 2024-08-03
### Added
- Refresh Models button
@ -71,13 +384,13 @@ ### Added
### Fixed
- iostream llama.cpp issue, quantized_models directory created on launch
## [1.2.0] - 2024-08-03
## [v1.2.0] - 2024-08-03
### Added
- More robust logging (find logs at latest_<timestamp>.log in logs folder)
- Localizations with support for 28 languages (machine translated using Gemini Experimental 0801)
## [1.1.0] - 2024-08-03
## [v1.1.0] - 2024-08-03
### Added
- Dynamic KV override functionality
@ -100,7 +413,7 @@ ### Added
### Fixed
- Issue where quantization errored with "AutoGGUF does not have x attribute"
## [1.0.0] - 2024-08-02
## [v1.0.0] - 2024-08-02
### Added
- Initial release

127
CODE_OF_CONDUCT.md Normal file

@ -0,0 +1,127 @@
# Contributor Covenant Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, religion, or sexual identity
and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
* Focusing on what is best not just for us as individuals, but for the
overall community
Examples of unacceptable behavior include:
* The use of sexualized language or imagery, and sexual attention or
advances of any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email
address, without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement in the Discussions tab.
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series
of actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or
permanent ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within
the community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.0, available at
https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
Community Impact Guidelines were inspired by [Mozilla's code of conduct
enforcement ladder](https://github.com/mozilla/diversity).
[homepage]: https://www.contributor-covenant.org
For answers to common questions about this code of conduct, see the FAQ at
https://www.contributor-covenant.org/faq. Translations are available at
https://www.contributor-covenant.org/translations.

View File

@ -2,8 +2,6 @@ # Contributing to AutoGGUF
First off, thanks for taking the time to contribute! 🎉👍
## How Can I Contribute?
### Reporting Bugs
- Use the issue tracker to report bugs
@ -15,13 +13,18 @@ ### Suggesting Enhancements
- Use the issue tracker to suggest enhancements
- Explain why this enhancement would be useful
### Your First Code Contribution
### Code Contributions
You can find issues labeled with "good first issue" in the Issues tab as a starting point. Code refactors and optimizations are also appreciated, although if you find a vulnerability please report it privately in the Security tab. For feature PRs, please open a discussion first to make sure your feature can be added and continuously maintained.
1. Fork the repo
2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
4. Push to the branch (`git push origin feature/AmazingFeature`)
5. Open a Pull Request
2. Clone your fork (`git clone https://github.com/your-username/AutoGGUF.git && cd AutoGGUF`)
3. Create your feature branch (`git checkout -b feature/AmazingFeature`)
5. Install pre-commit: (`pip install pre-commit`)
6. Set up the git hook scripts: (`pre-commit install`)
7. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
8. Push to the branch (`git push origin feature/AmazingFeature`)
9. Open a Pull Request on GitHub
## Styleguides
@ -29,7 +32,7 @@ ### Git Commit Messages
- Use the present tense ("Add feature" not "Added feature")
- Use the imperative mood ("Move cursor to..." not "Moves cursor to...")
- Limit the first line to 72 characters or less
- Limit the first line to 72 characters or fewer
### Commit Types:
@ -49,6 +52,7 @@ ### Commit Types:
### Python Styleguide
- Follow PEP 8
- Please use Black to format your code first
- Use meaningful variable names
- Comment your code, but don't overdo it

View File

@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2024 leafspark
Copyright (c) 2024-2025 leafspark
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

164
README.md

@ -2,105 +2,169 @@
# AutoGGUF - automated GGUF model quantizer
<!-- Project Status -->
[![GitHub release](https://img.shields.io/github/release/leafspark/AutoGGUF.svg)](https://github.com/leafspark/AutoGGUF/releases)
[![GitHub last commit](https://img.shields.io/github/last-commit/leafspark/AutoGGUF.svg)](https://github.com/leafspark/AutoGGUF/commits)
[![CI/CD Status](https://img.shields.io/badge/CI%2FCD-passing-brightgreen)]()
<!-- Project Info -->
[![Powered by llama.cpp](https://img.shields.io/badge/Powered%20by-llama.cpp-green.svg)](https://github.com/ggerganov/llama.cpp)
![GitHub release](https://img.shields.io/github/release/leafspark/AutoGGUF.svg)
![GitHub last commit](https://img.shields.io/github/last-commit/leafspark/AutoGGUF.svg)
[![Platform Compatibility](https://img.shields.io/badge/platform-Linux%20%7C%20macOS%20%7C%20Windows-blue)]()
[![GitHub license](https://img.shields.io/github/license/leafspark/AutoGGUF.svg)](https://github.com/leafspark/AutoGGUF/blob/main/LICENSE)
![GitHub top language](https://img.shields.io/github/languages/top/leafspark/AutoGGUF.svg)
<!-- Repository Stats -->
![GitHub stars](https://img.shields.io/github/stars/leafspark/AutoGGUF.svg)
![GitHub forks](https://img.shields.io/github/forks/leafspark/AutoGGUF.svg)
![GitHub top language](https://img.shields.io/github/languages/top/leafspark/AutoGGUF.svg)
![GitHub release (latest by date)](https://img.shields.io/github/downloads/leafspark/AutoGGUF/latest/total?color=green)
![GitHub repo size](https://img.shields.io/github/repo-size/leafspark/AutoGGUF.svg)
![GitHub license](https://img.shields.io/github/license/leafspark/AutoGGUF.svg)
<!-- ![Lines of Code](https://ghloc.vercel.app/leafspark/AutoGGUF?filter=.bat$,.py$,.sh$,.bat$) -->
AutoGGUF provides a graphical user interface for quantizing GGUF models using the llama.cpp library. It allows users to download different versions of llama.cpp, manage multiple backends, and perform quantization tasks with various options.
<!-- Contribution -->
[![Issues](https://img.shields.io/github/issues/leafspark/AutoGGUF)](https://github.com/leafspark/AutoGGUF/issues)
[![Code Style: Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/leafspark/AutoGGUF/pulls)
The most comprehensive GUI tool for GGUF model quantization. Stop wrestling with command lines - quantize, merge, and optimize your models with just a few clicks.
## Features
- Download and manage llama.cpp backends
- Select and quantize GGUF models
- Configure quantization parameters
- Monitor system resources during quantization
- 📩 Update and manage llama.cpp backends
- 🗃️ Download and quantize GGUF/safetensors models
- 📐 Configure quantization parameters
- 💻 Monitor system resources in real time during quantization
- ⏳ Parallel quantization + imatrix generation
- 🎉 LoRA conversion and merging
- 📁 Preset saving and loading
- 8⃣ AutoFP8 quantization
- 🪓 GGUF splitting and merging
- 🌐 HTTP API for automation and monitoring
## Usage
## Why AutoGGUF?
- Fast: Saves time on manual configuration
- Simple: Clean UI, no terminal needed
- Powerful: Handles models of any size, limited only by your RAM
- Resource-aware: Optimized memory management and efficient UI library
### Cross-platform
1. Install dependencies:
![AutoGGUF-v1 8 1-showcase-blue](https://github.com/user-attachments/assets/b136ccc3-5983-4266-9e66-00cebf3ca590)
## Quick Start
### Cross-platform (recommended)
1. `git clone https://github.com/leafspark/AutoGGUF`
2. `cd AutoGGUF`
3. Install dependencies:
```
pip install -r requirements.txt
```
or
```
pip install PyQt6 requests psutil shutil
```
2. Run the application:
4. Run the application:
```
python src/main.py
```
or use the `run.bat` script.
### Windows
macOS and Ubuntu builds are provided via GitHub Actions; you can download the binaries from the Releases section.
### Windows (for the impatient)
Standard builds:
1. Download the latest release
2. Extract all files to a folder
3. Run `AutoGGUF.exe`
3. Run `AutoGGUF-x64.exe`
4. Any necessary folders will be automatically created
Setup builds:
1. Download the setup variant of latest release
2. Extract all files to a folder
3. Run the setup program
4. The .gguf extension will be registered with the program automatically
5. Run the program from the Start Menu or desktop shortcuts
After launching the program, you may access its local server at port 7001 (set `AUTOGGUF_SERVER` to "enabled" first).
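As a rough illustration of using that server, the endpoints described in the v1.6.2 changelog notes can be queried with `curl`; this is only a sketch assuming the default `AUTOGGUF_SERVER_PORT=7001` and no API key configured:

```bash
# Heartbeat check
curl http://localhost:7001/v1/health

# List detected llama.cpp backends and currently running tasks
curl http://localhost:7001/v1/backends
curl http://localhost:7001/v1/tasks
```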
### Verifying Releases
#### Linux/macOS:
```bash
gpg --import AutoGGUF-v1.5.0-prerel.asc
gpg --verify AutoGGUF-v1.9.1-Windows-avx2.zip.sig AutoGGUF-v1.9.1-Windows-avx2.zip
sha256sum -c AutoGGUF-v1.9.1.sha256
```
#### Windows (PowerShell):
```powershell
# Import the public key
gpg --import AutoGGUF-v1.5.0-prerel.asc
# Verify the signature
gpg --verify AutoGGUF-v1.9.1-Windows-avx2.zip.sig AutoGGUF-v1.9.1-Windows-avx2.zip
# Check SHA256
$fileHash = (Get-FileHash -Algorithm SHA256 AutoGGUF-v1.9.1-Windows-avx2.zip).Hash.ToLower()
$storedHash = (Get-Content AutoGGUF-v1.9.1.sha256 | Select-String AutoGGUF-v1.9.1-Windows-avx2.zip).Line.Split()[0]
if ($fileHash -eq $storedHash) { "SHA256 Match" } else { "SHA256 Mismatch" }
```
Release keys are identical to the ones used for signing commits.
## Building
### Cross-platform
```bash
cd src
pip install -U pyinstaller
pyinstaller main.py --onefile
cd dist/main
./main
./build.sh RELEASE | DEV
cd build/<type>/dist/
./AutoGGUF
```
### Windows
```bash
build RELEASE/DEV
pip install -U pyinstaller
build RELEASE | DEV
```
Find the executable in `build/<type>/dist/AutoGGUF.exe`.
Find the executable in `build/<type>/dist/AutoGGUF-x64.exe`.
## Dependencies
- PyQt6
- requests
- psutil
- shutil
- OpenSSL
You can also use Nuitka, which may result in a slower build but a faster output executable:
```bash
build_optimized RELEASE | DEV
```
## Localizations
View the list of supported languages at [AutoGGUF/wiki/Installation#configuration](https://github.com/leafspark/AutoGGUF/wiki/Installation#configuration) (LLM translated, except for English).
To use a specific language, set the `AUTOGGUF_LANGUAGE` environment variable to one of the listed language codes.
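For example, a one-off run with a different UI language might look like the sketch below; `de-DE` is purely an illustrative code, so check the wiki list for the codes that are actually shipped:

```bash
# Run from source with a specific UI language for this session only
AUTOGGUF_LANGUAGE=de-DE python src/main.py
```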
Languages will be updated as soon as possible after an update, or as a part of the update.
## Known Issues
To use a specific language, set the `AUTOGGUF_LANGUAGE` environment variable to one of the listed language codes (note: some languages may not be fully supported yet, in which case the UI elements will fall back to English).
- Saving preset while quantizing causes UI thread crash (planned fix: remove this feature)
- Cannot delete task while processing (planned fix: disallow deletion before cancelling or cancel automatically)
- Base Model text still shows when GGML is selected as LoRA type (fix: include text in show/hide Qt layout)
## Issues
- Some inconsistent logging and signal handling
- Missing or duplicated translations (priority)
- Buggy/incomplete API interfaces
- Code review and formatting (priority)
## Planned Features
- Actual progress bar tracking
- Download safetensors from HF and convert to unquantized GGUF
- Perplexity testing
- Managing shards (coming in the next release)
- Time estimation for quantization
- Dynamic values for KV cache (coming in the next release)
- Ability to select and start multiple quants at once (saved in presets, coming in the next release)
- [ ] Time estimation for quantization
- [ ] Quantization file size estimate
- [ ] Perplexity testing
- [ ] bitsandbytes support
## Troubleshooting
#### Project Status
AutoGGUF has now entered maintenance mode. It's considered stable and feature-complete for most use cases, so I'm not actively developing new features, but I'll continue to publish occasional builds, update dependencies regularly, and fix critical bugs as needed. If you encounter issues or have suggestions, feel free to open an issue.
## Support
- SSL module cannot be found error: Install OpenSSL or run from source using `python src/main.py` with the `run.bat` script (`pip install requests`)
- Check out the [Wiki](https://github.com/leafspark/AutoGGUF/wiki) for advanced usage and configuration
## Contributing
Fork the repo, make your changes, and ensure you have the latest commits when merging. Include a changelog of new features in your pull request description.
## User Interface
![image](https://github.com/user-attachments/assets/2660c841-07ba-4c3f-ae3a-e63c7068bdc1)
Fork the repo, make your changes, and ensure you have the latest commits when merging. Include a changelog of new features in your pull request description. Read `CONTRIBUTING.md` for more information.
## Stargazers
[![Star History Chart](https://api.star-history.com/svg?repos=leafspark/AutoGGUF&type=Date)](https://star-history.com/#leafspark/AutoGGUF&Date)
`Last Updated: May 24, 2025`

13
SECURITY.md Normal file

@ -0,0 +1,13 @@
# Security Policy
## Supported Versions
| Version | Supported |
|-----------------|--------------------|
| stable (v2.0.x) | :white_check_mark: |
Beta versions are not officially supported and may contain unknown security vulnerabilities. Use them at your own risk.
## Reporting a Vulnerability
Use the Issues tab, or for severe vulnerabilities, please contact the maintainers via email.

81
assets/autogguf.iss Normal file

@ -0,0 +1,81 @@
#define MyAppName "AutoGGUF"
#define MyAppVersion "v1.7.1"
#define MyAppPublisher "leafspark"
#define MyAppURL "https://github.com/leafspark/AutoGGUF"
#define MyAppExeName "AutoGGUF-x64.exe"
#define MyAppAssocName MyAppName + " File"
#define MyAppAssocExt ".gguf"
#define MyAppAssocKey StringChange(MyAppAssocName, " ", "") + MyAppAssocExt
[Setup]
; NOTE: The value of AppId uniquely identifies this application. Do not use the same AppId value in installers for other applications.
; (To generate a new GUID, click Tools | Generate GUID inside the IDE.)
AppId={{9753D5EB-05A8-489B-86A4-FCE6341FDE0E}
AppName={#MyAppName}
AppVersion={#MyAppVersion}
;AppVerName={#MyAppName} {#MyAppVersion}
AppPublisher={#MyAppPublisher}
AppPublisherURL={#MyAppURL}
AppSupportURL={#MyAppURL}
AppUpdatesURL={#MyAppURL}
DefaultDirName={autopf}\{#MyAppName}
; "ArchitecturesAllowed=x64compatible" specifies that Setup cannot run
; on anything but x64 and Windows 11 on Arm.
ArchitecturesAllowed=x64compatible
; "ArchitecturesInstallIn64BitMode=x64compatible" requests that the
; install be done in "64-bit mode" on x64 or Windows 11 on Arm,
; meaning it should use the native 64-bit Program Files directory and
; the 64-bit view of the registry.
ArchitecturesInstallIn64BitMode=x64compatible
ChangesAssociations=yes
DisableProgramGroupPage=yes
LicenseFile=F:\autogguf-release\LICENSE.txt
; Remove the following line to run in administrative install mode (install for all users.)
PrivilegesRequired=lowest
PrivilegesRequiredOverridesAllowed=dialog
OutputDir=E:\Downloads\autogguf-inno
OutputBaseFilename=autogguf
Compression=lzma
SolidCompression=yes
WizardStyle=modern
[Languages]
Name: "english"; MessagesFile: "compiler:Default.isl"
Name: "brazilianportuguese"; MessagesFile: "compiler:Languages\BrazilianPortuguese.isl"
Name: "dutch"; MessagesFile: "compiler:Languages\Dutch.isl"
Name: "finnish"; MessagesFile: "compiler:Languages\Finnish.isl"
Name: "french"; MessagesFile: "compiler:Languages\French.isl"
Name: "german"; MessagesFile: "compiler:Languages\German.isl"
Name: "hungarian"; MessagesFile: "compiler:Languages\Hungarian.isl"
Name: "italian"; MessagesFile: "compiler:Languages\Italian.isl"
Name: "japanese"; MessagesFile: "compiler:Languages\Japanese.isl"
Name: "korean"; MessagesFile: "compiler:Languages\Korean.isl"
Name: "polish"; MessagesFile: "compiler:Languages\Polish.isl"
Name: "portuguese"; MessagesFile: "compiler:Languages\Portuguese.isl"
Name: "russian"; MessagesFile: "compiler:Languages\Russian.isl"
Name: "spanish"; MessagesFile: "compiler:Languages\Spanish.isl"
Name: "turkish"; MessagesFile: "compiler:Languages\Turkish.isl"
Name: "ukrainian"; MessagesFile: "compiler:Languages\Ukrainian.isl"
[Tasks]
Name: "desktopicon"; Description: "{cm:CreateDesktopIcon}"; GroupDescription: "{cm:AdditionalIcons}"; Flags: unchecked
[Files]
Source: "F:\autogguf-release\AutoGGUF-v1.7.1-Windows-avx2-standard\{#MyAppExeName}"; DestDir: "{app}"; Flags: ignoreversion
Source: "F:\autogguf-release\AutoGGUF-v1.7.1-Windows-avx2-standard\src\*"; DestDir: "{app}"; Flags: ignoreversion recursesubdirs createallsubdirs
; NOTE: Don't use "Flags: ignoreversion" on any shared system files
[Registry]
Root: HKA; Subkey: "Software\Classes\{#MyAppAssocExt}\OpenWithProgids"; ValueType: string; ValueName: "{#MyAppAssocKey}"; ValueData: ""; Flags: uninsdeletevalue
Root: HKA; Subkey: "Software\Classes\{#MyAppAssocKey}"; ValueType: string; ValueName: ""; ValueData: "{#MyAppAssocName}"; Flags: uninsdeletekey
Root: HKA; Subkey: "Software\Classes\{#MyAppAssocKey}\DefaultIcon"; ValueType: string; ValueName: ""; ValueData: "{app}\{#MyAppExeName},0"
Root: HKA; Subkey: "Software\Classes\{#MyAppAssocKey}\shell\open\command"; ValueType: string; ValueName: ""; ValueData: """{app}\{#MyAppExeName}"" ""%1"""
Root: HKA; Subkey: "Software\Classes\Applications\{#MyAppExeName}\SupportedTypes"; ValueType: string; ValueName: ".myp"; ValueData: ""
[Icons]
Name: "{autoprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"
Name: "{autodesktop}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; Tasks: desktopicon
[Run]
Filename: "{app}\{#MyAppExeName}"; Description: "{cm:LaunchProgram,{#StringChange(MyAppName, '&', '&&')}}"; Flags: nowait postinstall skipifsilent

1
assets/default.css Normal file

@ -0,0 +1 @@
/* Leave this file blank for default theme */

View File

@ -1,33 +1,20 @@
#!/bin/bash
if [ $# -eq 0 ]; then
echo "Usage: $0 [RELEASE|DEV]"
echo "Usage: build.sh [RELEASE|DEV]"
exit 1
fi
BUILD_TYPE=$1
ICON_PATH="../../assets/favicon_large.png"
ASSETS_PATH="../../assets"
SRC_PATH="src/main.py"
case $BUILD_TYPE in
RELEASE)
OUTPUT_DIR="build/release"
EXTRA_ARGS="--windowed"
;;
DEV)
OUTPUT_DIR="build/dev"
EXTRA_ARGS=""
;;
*)
echo "Invalid build type. Use RELEASE or DEV."
exit 1
;;
esac
echo "Building $BUILD_TYPE version..."
pyinstaller $EXTRA_ARGS --onefile --name=AutoGGUF --icon=$ICON_PATH --add-data "$ASSETS_PATH:assets" --distpath=$OUTPUT_DIR/dist --workpath=$OUTPUT_DIR/build --specpath=$OUTPUT_DIR $SRC_PATH
if [ "${1,,}" = "release" ]; then
echo "Building RELEASE version..."
pyinstaller --windowed --onefile --name=AutoGGUF --icon=../../assets/favicon_large.png --add-data "../../assets:assets" --distpath=build/release/dist --workpath=build/release/build --specpath=build/release src/main.py
elif [ "${1,,}" = "dev" ]; then
echo "Building DEV version..."
pyinstaller --onefile --name=AutoGGUF --icon=../../assets/favicon_large.png --add-data "../../assets:assets" --distpath=build/dev/dist --workpath=build/dev/build --specpath=build/dev src/main.py
else
echo "Invalid argument. Use RELEASE or DEV."
exit 1
fi
if [ $? -ne 0 ]; then
echo "Build failed."

View File

@ -1,74 +0,0 @@
@echo off
cls
echo.
echo . .
echo . ^| .
echo . . . .
echo . . ^|. ^|.. .
echo . ^| ^| ^| .
echo . ^| ^| ^| ^| ^| .
echo ." ^|...^| ."
echo ." ." ^| ^|.
echo ." ." ^| ^| .
echo ." ." ^| ."
echo ." ." ."
echo ____." ." ."
echo ." ."
echo ." ."
echo ." ." AutoGGUF Builder v1.337
echo ." ." ~~~ Cracked by CODEX Team ~~~
echo ."."
echo ."."
echo "."
echo.
echo +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
echo : Release Notes :
echo : - Now with 100%% less Python dependency! :
echo : - Added quantum entanglement for faster builds :
echo : - Integrated AI to write better code than you :
echo : - Free pizza with every successful compile (while stocks last) :
echo +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
echo : Installation :
echo : 1. Run this totally legit .bat file :
echo : 2. Choose your poison: RELEASE or DEV :
echo : 3. ??? :
echo : 4. Profit! :
echo +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
echo : System Requirements :
echo : - A computer (duh) :
echo : - Electricity (optional but recommended) :
echo : - At least 3 brain cells :
echo +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
echo.
if "%1"=="" (
echo [!] ERROR: No build type specified. RTFM, n00b!
echo Usage: build.bat [RELEASE^|DEV]
exit /b 1
)
if /I "%1"=="RELEASE" (
echo [+] Initiating RELEASE build sequence...
echo [+] Compressing code until it becomes a singularity...
pyinstaller --windowed --onefile --name=AutoGGUF --icon=../../assets/favicon_large.png --add-data "../../assets;assets" --distpath=build\release\dist --workpath=build\release\build --specpath=build\release src\main.py
) else if /I "%1"=="DEV" (
echo [+] Launching DEV build missiles...
echo [+] Obfuscating code to confuse even its creator...
pyinstaller --onefile --name=AutoGGUF --icon=../../assets/favicon_large.png --add-data "../../assets;assets" --distpath=build\dev\dist --workpath=build\dev\build --specpath=build\dev src\main.py
) else (
echo [!] FATAL ERROR: Invalid build type. Are you even trying?
echo Use RELEASE or DEV, genius.
exit /b 1
)
if errorlevel 1 (
echo [-] Build failed. Blame the intern.
exit /b 1
) else (
echo [+] Build completed successfully. Time to take credit for someone else's work!
)
echo.
echo +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
echo : Remember: Piracy is wrong. Unless you're really good at it. :
echo +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+

View File

@ -1,75 +0,0 @@
#!/bin/bash
clear
echo ""
echo " . ."
echo " . | ."
echo " . . . ."
echo " . . |. |.. ."
echo " . | | | ."
echo " . | | | | | ."
echo " .\" |...| .\""
echo " .\" .\" | |."
echo " .\" .\" | | ."
echo " .\" .\" | .\""
echo " .\" .\" .\""
echo "______.\" .\" .\""
echo " .\" .\""
echo " .\" .\""
echo " .\" .\" AutoGGUF Builder v1.337"
echo " .\" .\" ~~~ Cracked by CODEX Team ~~~"
echo " .\".\" "
echo " .\".\" "
echo ".\". "
echo ""
echo "+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+"
echo ": Release Notes :"
echo ": - Now with 100% less Python dependency! :"
echo ": - Added quantum entanglement for faster builds :"
echo ": - Integrated AI to write better code than you :"
echo ": - Free pizza with every successful compile (while stocks last) :"
echo "+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+"
echo ": Installation :"
echo ": 1. Run this totally legit .sh file :"
echo ": 2. Choose your poison: RELEASE or DEV :"
echo ": 3. ??? :"
echo ": 4. Profit! :"
echo "+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+"
echo ": System Requirements :"
echo ": - A computer (duh) :"
echo ": - Electricity (optional but recommended) :"
echo ": - At least 3 brain cells :"
echo "+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+"
echo ""
if [ $# -eq 0 ]; then
echo "[!] ERROR: No build type specified. RTFM, n00b!"
echo " Usage: $0 [RELEASE|DEV]"
exit 1
fi
if [ "$1" = "RELEASE" ]; then
echo "[+] Initiating RELEASE build sequence..."
echo "[+] Compressing code until it becomes a singularity..."
pyinstaller --windowed --onefile --name=AutoGGUF --icon=../../assets/favicon_large.png --add-data "../../assets:assets" --distpath=build/release/dist --workpath=build/release/build --specpath=build/release src/main.py
elif [ "$1" = "DEV" ]; then
echo "[+] Launching DEV build missiles..."
echo "[+] Obfuscating code to confuse even its creator..."
pyinstaller --onefile --name=AutoGGUF --icon=../../assets/favicon_large.png --add-data "../../assets:assets" --distpath=build/dev/dist --workpath=build/dev/build --specpath=build/dev src/main.py
else
echo "[!] FATAL ERROR: Invalid build type. Are you even trying?"
echo " Use RELEASE or DEV, genius."
exit 1
fi
if [ $? -ne 0 ]; then
echo "[-] Build failed. Blame the intern."
exit 1
else
echo "[+] Build completed successfully. Time to take credit for someone else's work!"
fi
echo ""
echo "+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+"
echo ": Remember: Piracy is wrong. Unless you're really good at it. :"
echo "+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+"

26
build_optimized.bat Normal file

@ -0,0 +1,26 @@
@echo off
if "%1"=="" (
echo Usage: build_optimized.bat [RELEASE^|DEV]
exit /b 1
)
set COMMON_FLAGS=--standalone --enable-plugin=pyside6 --include-data-dir=assets=assets
if /I "%1"=="RELEASE" (
echo Building RELEASE version...
python -m nuitka %COMMON_FLAGS% --windows-console-mode=disable --output-dir=build\release src\main.py --lto=yes
) else if /I "%1"=="DEV" (
echo Building DEV version...
python -m nuitka %COMMON_FLAGS% --output-dir=build\dev src\main.py
) else (
echo Invalid argument. Use RELEASE or DEV.
exit /b 1
)
if errorlevel 1 (
echo Build failed.
exit /b 1
) else (
echo Build completed successfully.
)

26
build_optimized.sh Normal file

@ -0,0 +1,26 @@
#!/bin/bash
if [ -z "$1" ]; then
echo "Usage: build_fast.sh [RELEASE|DEV]"
exit 1
fi
COMMON_FLAGS="--standalone --enable-plugin=pyside6 --include-data-dir=assets=assets"
if [ "$1" == "RELEASE" ]; then
echo "Building RELEASE version..."
python -m nuitka $COMMON_FLAGS --windows-console-mode=disable --output-dir=build/release src/main.py --lto=yes
elif [ "$1" == "DEV" ]; then
echo "Building DEV version..."
python -m nuitka $COMMON_FLAGS --output-dir=build/dev src/main.py
else
echo "Invalid argument. Use RELEASE or DEV."
exit 1
fi
if [ $? -ne 0 ]; then
echo "Build failed."
exit 1
else
echo "Build completed successfully."
fi

319
docs/AutoGGUF.py Normal file

@ -0,0 +1,319 @@
import importlib
import json
import re
import shutil
from datetime import datetime
from functools import partial
from typing import Any, Dict, List, Tuple
import requests
from PySide6.QtCore import *
from PySide6.QtGui import *
from PySide6.QtWidgets import *
from dotenv import load_dotenv
import lora_conversion
import presets
import ui_update
import utils
from CustomTitleBar import CustomTitleBar
from GPUMonitor import GPUMonitor
from Localizations import *
from Logger import Logger
from QuantizationThread import QuantizationThread
from TaskListItem import TaskListItem
from error_handling import handle_error, show_error
from imports_and_globals import (
ensure_directory,
open_file_safe,
resource_path,
show_about,
)
class CustomTitleBar(QWidget):
"""
Custom title bar for the main window, providing drag-and-drop functionality
and minimize/close buttons.
"""
def __init__(self, parent=None):
"""
Initializes the custom title bar.
Args:
parent (QWidget, optional): The parent widget. Defaults to None.
"""
class AutoGGUF(QMainWindow):
"""
Main application window for AutoGGUF, providing a user interface for
quantizing and converting large language models.
"""
def __init__(self):
"""
Initializes the main window, setting up the UI, logger, and other
necessary components.
"""
def keyPressEvent(self, event):
"""
Handles key press events for window resizing.
Args:
event (QKeyEvent): The key press event.
"""
def resize_window(self, larger):
"""
Resizes the window by a specified factor.
Args:
larger (bool): Whether to make the window larger or smaller.
"""
def reset_size(self):
"""Resets the window to its default size."""
def parse_resolution(self):
"""
Parses the resolution from the AUTOGGUF_RESOLUTION environment variable.
Returns:
tuple: The width and height of the window.
"""
def resizeEvent(self, event):
"""
Handles resize events to maintain rounded corners.
Args:
event (QResizeEvent): The resize event.
"""
def refresh_backends(self):
"""Refreshes the list of available backends."""
def save_task_preset(self, task_item):
"""
Saves the preset for a specific task.
Args:
task_item (TaskListItem): The task item to save the preset for.
"""
def browse_export_lora_model(self):
"""Opens a file dialog to browse for the export LORA model file."""
def browse_export_lora_output(self):
"""Opens a file dialog to browse for the export LORA output file."""
def add_lora_adapter(self):
"""Adds a LORA adapter to the export LORA list."""
def browse_base_model(self):
"""Opens a file dialog to browse for the base model folder."""
def delete_lora_adapter_item(self, adapter_widget):
"""
Deletes a LORA adapter item from the export LORA list.
Args:
adapter_widget (QWidget): The widget containing the adapter information.
"""
def browse_hf_model_input(self):
"""Opens a file dialog to browse for the HuggingFace model directory."""
def browse_hf_outfile(self):
"""Opens a file dialog to browse for the HuggingFace to GGUF output file."""
def convert_hf_to_gguf(self):
"""Converts a HuggingFace model to GGUF format."""
def export_lora(self):
"""Exports a LORA from a GGML model."""
def restart_task(self, task_item):
"""
Restarts a specific task.
Args:
task_item (TaskListItem): The task item to restart.
"""
def lora_conversion_finished(self, thread, input_path, output_path):
"""
Handles the completion of a LoRA conversion task.
Args:
thread (QuantizationThread): The thread that handled the conversion.
input_path (str): The path to the input LoRA file.
output_path (str): The path to the output GGML file.
"""
def download_finished(self, extract_dir):
"""
Handles the completion of a download, extracting files and updating the UI.
Args:
extract_dir (str): The directory where the downloaded files were extracted.
"""
def extract_cuda_files(self, extract_dir, destination):
"""
Extracts CUDA files from a downloaded archive.
Args:
extract_dir (str): The directory where the downloaded files were extracted.
destination (str): The destination directory for the CUDA files.
"""
def download_error(self, error_message):
"""
Handles download errors, displaying an error message and cleaning up.
Args:
error_message (str): The error message.
"""
def show_task_context_menu(self, position):
"""
Shows the context menu for a task item in the task list.
Args:
position (QPoint): The position of the context menu.
"""
def show_task_properties(self, item):
"""
Shows the properties dialog for a specific task.
Args:
item (QListWidgetItem): The task item.
"""
def toggle_gpu_offload_auto(self, state):
"""
Toggles the automatic GPU offload option.
Args:
state (Qt.CheckState): The state of the checkbox.
"""
def cancel_task_by_item(self, item):
"""
Cancels a task by its item in the task list.
Args:
item (QListWidgetItem): The task item.
"""
def cancel_task(self, item):
"""
Cancels a specific task.
Args:
item (QListWidgetItem): The task item.
"""
def delete_task(self, item):
"""
Deletes a specific task.
Args:
item (QListWidgetItem): The task item.
"""
def create_label(self, text, tooltip):
"""
Creates a QLabel with a tooltip.
Args:
text (str): The text for the label.
tooltip (str): The tooltip for the label.
Returns:
QLabel: The created label.
"""
def load_models(self):
"""Loads the available models and displays them in the model tree."""
def browse_models(self):
"""Opens a file dialog to browse for the models directory."""
def browse_output(self):
"""Opens a file dialog to browse for the output directory."""
def browse_logs(self):
"""Opens a file dialog to browse for the logs directory."""
def browse_imatrix(self):
"""Opens a file dialog to browse for the imatrix file."""
def validate_quantization_inputs(self):
"""Validates the inputs for quantization."""
def add_kv_override(self, override_string=None):
"""Adds a KV override entry to the list."""
def remove_kv_override(self, entry):
"""Removes a KV override entry from the list."""
def quantize_model(self):
"""Quantizes the selected model."""
def parse_progress(self, line, task_item):
"""
Parses the progress from the output line and updates the task item.
Args:
line (str): The output line.
task_item (TaskListItem): The task item.
"""
def task_finished(self, thread, task_item):
"""
Handles the completion of a task.
Args:
thread (QuantizationThread): The thread that handled the task.
task_item (TaskListItem): The task item.
"""
def show_task_details(self, item):
"""
Shows the details of a specific task.
Args:
item (QListWidgetItem): The task item.
"""
def browse_imatrix_datafile(self):
"""Opens a file dialog to browse for the imatrix data file."""
def browse_imatrix_model(self):
"""Opens a file dialog to browse for the imatrix model file."""
def browse_imatrix_output(self):
"""Opens a file dialog to browse for the imatrix output file."""
def get_models_data(self):
"""Retrieves data for all loaded models."""
def get_tasks_data(self):
"""Retrieves data for all tasks in the task list."""
def generate_imatrix(self):
"""Generates an imatrix file."""
def closeEvent(self, event: QCloseEvent):
"""
Handles close events, prompting the user if there are running tasks.
Args:
event (QCloseEvent): The close event.
"""

44
docs/DownloadThread.py Normal file

@ -0,0 +1,44 @@
import os
import zipfile
import requests
from PySide6.QtCore import QThread, Signal
class DownloadThread(QThread):
"""
A QThread subclass for downloading and extracting zip files.
This thread downloads a file from a given URL, saves it to a specified path,
extracts its contents if it's a zip file, and then removes the original zip file.
Signals:
progress_signal (int): Emits the download progress as a percentage.
finished_signal (str): Emits the path of the extracted directory upon successful completion.
error_signal (str): Emits an error message if an exception occurs during the process.
"""
def __init__(self, url: str, save_path: str) -> None:
"""
Initialize the DownloadThread.
Args:
url (str): The URL of the file to download.
save_path (str): The local path where the file will be saved.
"""
def run(self) -> None:
"""
Execute the download, extraction, and cleanup process.
This method performs the following steps:
1. Downloads the file from the specified URL.
2. Saves the file to the specified path.
3. Extracts the contents if it's a zip file.
4. Removes the original zip file after extraction.
5. Emits signals for progress updates, completion, or errors.
Raises:
Exception: Any exception that occurs during the process is caught
and emitted through the error_signal.
"""

87
docs/KVOverrideEntry.py Normal file

@ -0,0 +1,87 @@
class KVOverrideEntry(QWidget):
"""
KVOverrideEntry is a PySide6-based widget for creating and managing key-value override entries.
This class provides functionality for:
- Inputting keys and values with type specification
- Dynamic value substitution using predefined placeholders
- Validating inputs based on selected data types
- Generating formatted override strings
The widget includes input fields for keys and values, a type selector,
and a delete button. It supports various system-related and custom placeholders
for dynamic value generation.
Attributes:
deleted (Signal): Signal emitted when the entry is deleted.
key_input (QLineEdit): Input field for the key.
type_combo (QComboBox): Dropdown for selecting the value type.
value_input (QLineEdit): Input field for the value.
Supported dynamic placeholders:
{system.time.milliseconds}: Current time in milliseconds
{system.time.seconds}: Current time in seconds
{system.date.iso}: Current date in ISO format
{system.datetime.iso}: Current date and time in ISO format
{system.username}: Current system username
{system.hostname}: Current system hostname
{system.platform}: Current operating system platform
{system.python.version}: Python version
{system.date}: Current date in YYYY-MM-DD format
{model.name}: Model name (if provided)
{quant.type}: Quantization type (if provided)
{output.path}: Output path (if provided)
"""
def __init__(self, parent=None):
"""
Initialize the KVOverrideEntry widget.
This method sets up the widget layout, creates and configures input fields,
sets up validators, and connects signals to their respective slots.
Args:
parent (QWidget, optional): The parent widget. Defaults to None.
"""
def delete_clicked(self):
"""
Handle the delete button click event.
Emits the 'deleted' signal to notify the parent widget that this entry
should be removed.
"""
def get_override_string(self, model_name=None, quant_type=None, output_path=None):
"""
Generate a formatted override string with dynamic value substitution.
This method processes the input fields and replaces any placeholders
in the value with their corresponding dynamic values.
Args:
model_name (str, optional): Model name for substitution.
quant_type (str, optional): Quantization type for substitution.
output_path (str, optional): Output path for substitution.
Returns:
str: Formatted override string in the format "key=type:value".
"""
def get_raw_override_string(self):
"""
Generate a raw override string without dynamic substitution.
Returns:
str: Raw override string with placeholders intact, in the format "key=type:value".
"""
def update_validator(self, type_):
"""
Update the validator for the value input field based on the selected type.
This method ensures that the value input adheres to the chosen data type.
Args:
type_ (str): The selected data type ('int', 'float', or 'str').
"""

56
docs/Logger.py Normal file

@ -0,0 +1,56 @@
class Logger:
"""
A custom logger class for logging messages to both the console and a rotating log file.
The log file will be created in the specified `log_dir` with a timestamp in the filename.
The file will rotate when it reaches 10MB, keeping a maximum of 5 backup files.
"""
def __init__(self, name, log_dir):
"""
Initializes the logger with a specified name and log directory.
Args:
name (str): The name of the logger.
log_dir (str): The directory where log files will be stored.
"""
def debug(self, message):
"""
Logs a message with the DEBUG level.
Args:
message (str): The message to log.
"""
def info(self, message):
"""
Logs a message with the INFO level.
Args:
message (str): The message to log.
"""
def warning(self, message):
"""
Logs a message with the WARNING level.
Args:
message (str): The message to log.
"""
def error(self, message):
"""
Logs a message with the ERROR level.
Args:
message (str): The message to log.
"""
def critical(self, message):
"""
Logs a message with the CRITICAL level.
Args:
message (str): The message to log.
"""

28
docs/ModelInfoDialog.py Normal file

@ -0,0 +1,28 @@
class ModelInfoDialog(QDialog):
"""
A dialog window for displaying model information.
This class creates a dialog that shows detailed information about a machine learning model,
including its architecture, quantization type, and other relevant data.
Attributes:
None
Args:
model_info (dict): A dictionary containing the model's information.
parent (QWidget, optional): The parent widget of this dialog. Defaults to None.
"""
def format_model_info(self, model_info) -> str:
"""
Formats the model information into HTML for display.
This method takes the raw model information and converts it into a formatted HTML string,
which can be displayed in the dialog's QTextEdit widget.
Args:
model_info (dict): A dictionary containing the model's information.
Returns:
str: Formatted HTML string containing the model information.
"""


@ -0,0 +1,64 @@
class QuantizationThread(QThread):
"""
QuantizationThread is a PySide6-based thread for managing model quantization processes.
This class provides functionality for:
- Running quantization commands as subprocesses
- Parsing and emitting model information during quantization
- Logging quantization output to a file
- Communicating process status, output, and errors to the main thread
The thread manages the execution of quantization commands, monitors their output,
and parses relevant model information. It uses Qt signals to communicate various
events and data back to the main application thread.
Attributes:
output_signal (Signal): Signal emitting subprocess output lines.
status_signal (Signal): Signal for updating quantization status.
finished_signal (Signal): Signal emitted when quantization is complete.
error_signal (Signal): Signal for reporting errors during quantization.
model_info_signal (Signal): Signal for sending parsed model information.
Methods:
run(): Executes the quantization process and manages its lifecycle.
parse_model_info(line: str): Parses output lines for model information.
terminate(): Safely terminates the running subprocess.
"""
def __init__(self, command, cwd, log_file):
"""
Initialize the QuantizationThread.
Args:
command (list): The command to execute for quantization.
cwd (str): The working directory for the subprocess.
log_file (str): Path to the file where output will be logged.
"""
def run(self):
"""
Execute the quantization process.
This method runs the subprocess, captures its output, logs it,
parses model information, and emits signals for status updates.
It handles process completion and any exceptions that occur.
"""
def parse_model_info(self, line):
"""
Parse a line of subprocess output for model information.
This method extracts various pieces of model information from
the output lines and stores them in the model_info dictionary.
Args:
line (str): A line of output from the quantization process.
"""
def terminate(self):
"""
Terminate the running subprocess.
This method safely terminates the quantization process if it's
still running, using SIGTERM first and SIGKILL if necessary.
"""


@ -0,0 +1,32 @@
"""
Convert PEFT LoRA adapters to GGML format.
This script converts Hugging Face PEFT LoRA adapter files to the GGML format
used by llama.cpp and related projects. It reads the adapter configuration
from 'adapter_config.json' and the model weights from 'adapter_model.bin'
or 'adapter_model.safetensors', then writes the converted model to
'ggml-adapter-model.bin' in the same directory.
Usage:
python lora_to_gguf.py <path> [arch]
Arguments:
path: Directory containing the PEFT LoRA files
arch: Model architecture (default: llama)
The script supports various model architectures and handles both PyTorch
and safetensors formats for input weights. It performs necessary tensor
transformations and writes the output in the GGML binary format.
Requirements:
- Python 3.6+
- numpy
- torch
- safetensors (optional, for safetensors input)
The script also requires the GGUF Python module, which should be in the
'gguf-py/gguf' subdirectory relative to this script's location.
Note: This script is designed for use with llama.cpp and related projects.
Ensure compatibility with your target application when using the output.
"""


@ -0,0 +1,40 @@
"""
LoRA to GGUF Converter
This script converts a Hugging Face PEFT LoRA adapter to a GGML-compatible file format.
Key features:
- Supports various output formats (f32, f16, bf16, q8_0, auto)
- Handles big-endian and little-endian architectures
- Provides options for lazy evaluation and verbose output
- Combines base model information with LoRA adapters
Classes:
PartialLoraTensor: Dataclass for storing partial LoRA tensor information.
LoraTorchTensor: Custom tensor class for LoRA operations and transformations.
LoraModel: Extends the base model class to incorporate LoRA-specific functionality.
Functions:
get_base_tensor_name: Extracts the base tensor name from a LoRA tensor name.
pyinstaller_include: Placeholder for PyInstaller import handling.
parse_args: Parses command-line arguments for the script.
Usage:
python lora_to_gguf.py --base <base_model_path> <lora_adapter_path> [options]
Arguments:
--base: Path to the directory containing the base model file (required)
lora_path: Path to the directory containing the LoRA adapter file (required)
--outfile: Path to write the output file (optional)
--outtype: Output format (f32, f16, bf16, q8_0, auto; default: f16)
--bigendian: Flag to indicate big-endian machine execution
--no-lazy: Disable lazy evaluation (uses more RAM)
--verbose: Increase output verbosity
--dry-run: Perform a dry run without writing files
The script processes LoRA adapters, combines them with base model information,
and generates a GGML-compatible file for use in various applications.
Note: This script requires specific dependencies like torch, gguf, and safetensors.
Ensure all required libraries are installed before running the script.
"""

17
docs/lora_conversion.py Normal file

@ -0,0 +1,17 @@
def convert_lora(self):
"""Converts a LORA file to either GGML or GGUF format.
This function initiates the conversion process based on user input,
utilizing a separate thread for the actual conversion and providing
progress updates in the UI.
It validates input paths, constructs the conversion command, creates
a log file, manages the conversion thread, and handles errors.
Args:
self: The object instance.
Raises:
ValueError: If required input paths are missing.
"""

13
plugins/example.py Normal file

@ -0,0 +1,13 @@
class ExamplePlugin:
def init(self, autogguf_instance):
# This gets called after the plugin is loaded
print("Plugin initialized")
def __data__(self):
return {
"name": "ExamplePlugin",
"description": "This is an example plugin.",
"compatible_versions": ["*"],
"author": "leafspark",
"version": "v1.0.0",
}
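Since the plugin loader (see src/Plugins.py further down) copies every non-dunder, non-init attribute of a plugin instance onto the main AutoGGUF window, a plugin can also contribute methods. A hypothetical sketch under that assumption; "GreeterPlugin" and "say_hello" are invented names, not part of the project:

# Saved as e.g. plugins/greeter.py (hypothetical filename)
class GreeterPlugin:
    def init(self, autogguf_instance):
        # Called after load; apply_plugins has already attached say_hello to the window.
        autogguf_instance.say_hello()

    def say_hello(self):
        print("Hello from a plugin-provided method")

    def __data__(self):
        return {
            "name": "GreeterPlugin",
            "description": "Hypothetical plugin demonstrating attribute injection.",
            "compatible_versions": ["*"],
            "author": "example",
            "version": "v0.1.0",
        }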


@ -1,5 +1,14 @@
PyQt6>=6.0.0,<7.0.0
psutil>=5.8.0,<6.0.0
requests>=2.25.0,<3.0.0
numpy>=1.20.0,<2.0.0
torch>=1.9.0,<2.0.0
PyYAML~=6.0.2
psutil~=7.0.0
pynvml~=12.0.0
PySide6~=6.9.1
safetensors~=0.5.3
numpy<2.0.0
torch~=2.7.0
sentencepiece~=0.2.0
setuptools~=80.7.1
huggingface-hub~=0.33.1
transformers~=4.51.3
fastapi~=0.115.12
uvicorn~=0.34.2
certifi~=2025.4.26

31
run.sh Normal file

@ -0,0 +1,31 @@
#!/bin/sh
# Check if Python is installed
if ! command -v python3 >/dev/null 2>&1; then
echo "Error: Python 3 is not installed or not in the PATH."
echo "Please install Python 3 and try again."
exit 1
fi
# Set environment variables
export PYTHONIOENCODING=utf-8
export AUTOGGUF_LANGUAGE=en-US
# Try to run main.py in the current directory
if [ -f "main.py" ]; then
echo "Running main.py in the current directory..."
python3 main.py
exit 0
fi
# If main.py doesn't exist in the current directory, try src/main.py
if [ -f "src/main.py" ]; then
echo "Running src/main.py..."
python3 src/main.py
exit 0
fi
# If neither file is found, display an error message
echo "Error: Neither main.py nor src/main.py found."
echo "Please make sure the script is in the correct directory."
exit 1

17
setup.py Normal file

@ -0,0 +1,17 @@
from setuptools import setup
with open("requirements.txt") as f:
required = f.read().splitlines()
setup(
name="AutoGGUF",
version="v2.0.1",
packages=[""],
url="https://github.com/leafspark/AutoGGUF",
license="apache-2.0",
author="leafspark",
author_email="leafspark@proton.me",
description="automatically quant GGUF models",
install_requires=required,
entry_points={"console_scripts": ["autogguf-gui = main:main"]},
)

File diff suppressed because it is too large.

112
src/CustomTitleBar.py Normal file

@ -0,0 +1,112 @@
from PySide6.QtCore import QPoint, Qt
from PySide6.QtWidgets import QHBoxLayout, QLabel, QMenuBar, QPushButton, QWidget
class CustomTitleBar(QWidget):
def __init__(self, parent=None) -> None:
super().__init__(parent)
self.parent = parent
layout = QHBoxLayout(self)
layout.setContentsMargins(10, 5, 10, 5)
# Add the favicon
# TODO: uncomment this
# self.icon_label = QLabel()
# self.icon_label.setPixmap(QPixmap(resource_path("assets/favicon.ico")))
# layout.addWidget(self.icon_label)
# Add app title (bolded)
self.title = QLabel("<b>AutoGGUF</b>") # Use HTML tags for bolding
layout.addWidget(self.title)
# Add menubar here
self.menubar = QMenuBar()
layout.addWidget(self.menubar) # Add menubar to the layout
layout.addStretch(1) # This pushes the buttons to the right
# Add minimize and close buttons
self.minimize_button = QPushButton("")
self.close_button = QPushButton("")
for button in (self.minimize_button, self.close_button):
button.setFixedSize(30, 30)
button.setStyleSheet(
"""
QPushButton {
border: none;
background-color: transparent;
}
QPushButton:hover {
background-color: rgba(255, 255, 255, 0.1);
}
"""
)
# Enable mouse tracking for smoother movement
self.setMouseTracking(True)
# Add maximize button
self.maximize_button = QPushButton("")
self.maximize_button.setFixedSize(30, 30)
self.maximize_button.setStyleSheet(
"""
QPushButton {
border: none;
background-color: transparent;
padding: 2px;
font-size: 15px;
}
QPushButton:hover {
background-color: rgba(255, 255, 255, 0.1);
}
"""
)
self.maximize_button.clicked.connect(self.toggle_maximize)
layout.addWidget(self.minimize_button)
layout.addWidget(self.maximize_button)
layout.addWidget(self.close_button)
self.minimize_button.clicked.connect(self.parent.showMinimized)
self.close_button.clicked.connect(self.parent.close)
self.start = QPoint(0, 0)
self.pressing = False
self.isMaximized = False # Flag to track maximization state
self.normal_size = None # Store the normal window size
def mousePressEvent(self, event) -> None:
if event.button() == Qt.LeftButton:
self.start = event.globalPos() - self.parent.frameGeometry().topLeft()
self.pressing = True
def mouseMoveEvent(self, event) -> None:
if self.pressing:
new_pos = event.globalPos() - self.start
screen = self.parent.screen()
screen_geo = screen.availableGeometry()
# Check if the new position would put the titlebar below the taskbar
if (
new_pos.y() + self.parent.height() > screen_geo.bottom()
): # Use screen_geo.bottom()
new_pos.setY(screen_geo.bottom() - self.parent.height())
self.parent.move(new_pos)
def mouseReleaseEvent(self, event) -> None:
self.pressing = False
def toggle_maximize(self) -> None:
if self.isMaximized:
self.parent.showNormal()
if self.normal_size:
self.parent.resize(self.normal_size)
self.maximize_button.setText("") # Change back to maximize symbol
self.isMaximized = False
else:
self.normal_size = self.parent.size() # Store the current size
self.parent.showMaximized()
self.maximize_button.setText("") # Change to restore symbol
self.isMaximized = True


@ -1,54 +1,60 @@
from PyQt6.QtWidgets import *
from PyQt6.QtCore import *
from PyQt6.QtGui import *
import os
import sys
import psutil
import subprocess
import time
import signal
import json
import platform
import requests
import zipfile
from datetime import datetime
class DownloadThread(QThread):
progress_signal = pyqtSignal(int)
finished_signal = pyqtSignal(str)
error_signal = pyqtSignal(str)
def __init__(self, url, save_path):
super().__init__()
self.url = url
self.save_path = save_path
def run(self):
try:
response = requests.get(self.url, stream=True)
response.raise_for_status()
total_size = int(response.headers.get('content-length', 0))
block_size = 8192
downloaded = 0
with open(self.save_path, 'wb') as file:
for data in response.iter_content(block_size):
size = file.write(data)
downloaded += size
if total_size:
progress = int((downloaded / total_size) * 100)
self.progress_signal.emit(progress)
# Extract the downloaded zip file
extract_dir = os.path.splitext(self.save_path)[0]
with zipfile.ZipFile(self.save_path, 'r') as zip_ref:
zip_ref.extractall(extract_dir)
# Remove the zip file after extraction
os.remove(self.save_path)
self.finished_signal.emit(extract_dir)
except Exception as e:
self.error_signal.emit(str(e))
if os.path.exists(self.save_path):
os.remove(self.save_path)
import os
import urllib.request
import urllib.error
import zipfile
import ssl
import certifi
from PySide6.QtCore import QThread, Signal
class DownloadThread(QThread):
progress_signal = Signal(int)
finished_signal = Signal(str)
error_signal = Signal(str)
def __init__(self, url, save_path) -> None:
super().__init__()
self.url = url
self.save_path = save_path
def run(self) -> None:
try:
req = urllib.request.Request(self.url)
# Create SSL context with certifi certificates
ssl_context = ssl.create_default_context(cafile=certifi.where())
with urllib.request.urlopen(req, context=ssl_context) as response:
if response.status != 200:
raise urllib.error.HTTPError(
self.url, response.status, "HTTP Error", response.headers, None
)
total_size = int(response.headers.get("Content-Length", 0))
block_size = 8192
downloaded = 0
with open(self.save_path, "wb") as file:
while True:
data = response.read(block_size)
if not data:
break
size = file.write(data)
downloaded += size
if total_size:
progress = int((downloaded / total_size) * 100)
self.progress_signal.emit(progress)
# Extract the downloaded zip file
extract_dir = os.path.splitext(self.save_path)[0]
with zipfile.ZipFile(self.save_path, "r") as zip_ref:
zip_ref.extractall(extract_dir)
# Remove the zip file after extraction
os.remove(self.save_path)
self.finished_signal.emit(extract_dir)
except Exception as e:
self.error_signal.emit(str(e))
if os.path.exists(self.save_path):
os.remove(self.save_path)

240
src/GPUMonitor.py Normal file

@ -0,0 +1,240 @@
import pynvml
from PySide6.QtCore import QTimer
from PySide6.QtGui import QPainter, QPen, QColor
from PySide6.QtWidgets import (
QWidget,
QHBoxLayout,
QVBoxLayout,
QProgressBar,
QLabel,
QDialog,
QTabWidget,
QGraphicsView,
QGraphicsScene,
QGraphicsLineItem,
QComboBox,
)
from Localizations import (
GPU_USAGE_FORMAT,
GPU_DETAILS,
GPU_USAGE_OVER_TIME,
VRAM_USAGE_OVER_TIME,
NO_GPU_DETECTED,
AMD_GPU_NOT_SUPPORTED,
CPU_USAGE_OVER_TIME,
RAM_USAGE_OVER_TIME,
)
from ui_update import animate_bar
class SimpleGraph(QGraphicsView):
def __init__(self, title, parent=None) -> None:
super().__init__(parent)
self.setScene(QGraphicsScene(self))
self.setRenderHint(QPainter.RenderHint.Antialiasing)
self.setMinimumHeight(200)
self.title = title
self.data = []
def update_data(self, data) -> None:
self.data = data
self.scene().clear()
if not self.data:
return
width = self.width() - 40
height = self.height() - 40
max_value = 100 # Fixed to 100% for GPU usage
# Draw axes
self.scene().addLine(20, height + 20, width + 20, height + 20)
self.scene().addLine(20, 20, 20, height + 20)
# Draw title
self.scene().addText(self.title).setPos(width // 2, 0)
# Draw graph
path = QPen(QColor(0, 120, 212), 2) # Blue color, 2px width
for i in range(1, len(self.data)):
x1 = 20 + (i - 1) * width / (len(self.data) - 1)
y1 = 20 + height - (self.data[i - 1] * height / max_value)
x2 = 20 + i * width / (len(self.data) - 1)
y2 = 20 + height - (self.data[i] * height / max_value)
line = QGraphicsLineItem(x1, y1, x2, y2)
line.setPen(path)
self.scene().addItem(line)
def resizeEvent(self, event) -> None:
super().resizeEvent(event)
self.update_data(self.data)
class GPUMonitor(QWidget):
def __init__(self, parent=None) -> None:
super().__init__(parent)
self.setMinimumHeight(30)
self.setMaximumHeight(30)
layout = QHBoxLayout(self)
layout.setContentsMargins(0, 0, 0, 0)
self.gpu_selector = QComboBox()
self.gpu_selector.setVisible(False)
self.gpu_selector.currentIndexChanged.connect(self.change_gpu)
layout.addWidget(self.gpu_selector)
self.gpu_bar = QProgressBar()
self.gpu_bar.setTextVisible(False)
layout.addWidget(self.gpu_bar)
self.gpu_label = QLabel()
layout.addWidget(self.gpu_label)
self.timer = QTimer(self)
self.timer.timeout.connect(self.update_gpu_info)
self.timer.start(500) # Update every 0.5 seconds
self.gpu_data = []
self.vram_data = []
self.handles = []
self.current_gpu = 0
try:
pynvml.nvmlInit()
device_count = pynvml.nvmlDeviceGetCount()
for i in range(device_count):
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
name = pynvml.nvmlDeviceGetName(handle)
# Handle both string and bytes cases
if isinstance(name, bytes):
name = name.decode("utf-8")
self.handles.append(handle)
self.gpu_selector.addItem(f"NVIDIA GPU {i}: {name}")
if device_count > 1:
self.gpu_selector.setVisible(True)
if device_count == 0:
self.check_for_amd_gpu()
except pynvml.NVMLError:
self.check_for_amd_gpu()
if not self.handles:
self.gpu_label.setText(NO_GPU_DETECTED)
def check_for_amd_gpu(self) -> None:
# This is a placeholder. Implementing AMD GPU detection would require
# platform-specific methods or additional libraries.
self.gpu_label.setText(AMD_GPU_NOT_SUPPORTED)
def change_gpu(self, index) -> None:
self.current_gpu = index
self.gpu_data.clear()
self.vram_data.clear()
def update_gpu_info(self) -> None:
if self.handles:
try:
handle = self.handles[self.current_gpu]
utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
gpu_usage = utilization.gpu
vram_usage = (memory.used / memory.total) * 100
animate_bar(self, self.gpu_bar, int(vram_usage))
self.gpu_label.setText(
GPU_USAGE_FORMAT.format(
gpu_usage,
vram_usage,
memory.used // 1024 // 1024,
memory.total // 1024 // 1024,
)
)
self.gpu_data.append(gpu_usage)
self.vram_data.append(vram_usage)
if len(self.gpu_data) > 60:
self.gpu_data.pop(0)
self.vram_data.pop(0)
except pynvml.NVMLError:
self.gpu_bar.setValue(0)
self.gpu_label.setText(GPU_USAGE_FORMAT.format(0, 0, 0, 0))
def mouseDoubleClickEvent(self, event) -> None:
if self.handles:
self.show_detailed_stats()
def show_ram_graph(self, event) -> None:
self.show_detailed_stats_std(RAM_USAGE_OVER_TIME, self.ram_data)
def show_cpu_graph(self, event) -> None:
self.show_detailed_stats_std(CPU_USAGE_OVER_TIME, self.cpu_data)
def show_detailed_stats_std(self, title, data) -> None:
dialog = QDialog(self)
dialog.setWindowTitle(title)
dialog.setMinimumSize(800, 600)
layout = QVBoxLayout(dialog)
graph = SimpleGraph(title)
layout.addWidget(graph)
def update_graph_data() -> None:
graph.update_data(data)
timer = QTimer(dialog)
timer.timeout.connect(update_graph_data)
timer.start(500) # Update every 0.5 seconds
dialog.exec()
def show_detailed_stats(self) -> None:
dialog = QDialog(self)
dialog.setWindowTitle(GPU_DETAILS)
dialog.setMinimumSize(800, 600)
layout = QVBoxLayout(dialog)
if len(self.handles) > 1:
gpu_selector = QComboBox()
gpu_selector.addItems(
[
self.gpu_selector.itemText(i)
for i in range(self.gpu_selector.count())
]
)
gpu_selector.setCurrentIndex(self.current_gpu)
gpu_selector.currentIndexChanged.connect(self.change_gpu)
layout.addWidget(gpu_selector)
tab_widget = QTabWidget()
layout.addWidget(tab_widget)
gpu_graph = SimpleGraph(GPU_USAGE_OVER_TIME)
vram_graph = SimpleGraph(VRAM_USAGE_OVER_TIME)
def update_graph_data() -> None:
gpu_graph.update_data(self.gpu_data)
vram_graph.update_data(self.vram_data)
timer = QTimer(dialog)
timer.timeout.connect(update_graph_data)
timer.start(500) # Update every 0.5 seconds
tab_widget.addTab(gpu_graph, GPU_USAGE_OVER_TIME)
tab_widget.addTab(vram_graph, VRAM_USAGE_OVER_TIME)
dialog.exec()
def closeEvent(self, event) -> None:
if self.handles:
pynvml.nvmlShutdown()
super().closeEvent(event)


@ -1,83 +1,122 @@
from PyQt6.QtWidgets import QWidget, QHBoxLayout, QLineEdit, QComboBox, QPushButton
from PyQt6.QtCore import pyqtSignal, QRegularExpression
from PyQt6.QtGui import QDoubleValidator, QIntValidator, QRegularExpressionValidator
from datetime import datetime
import time
import os
import socket
import platform
class KVOverrideEntry(QWidget):
deleted = pyqtSignal(QWidget)
def __init__(self, parent=None):
super().__init__(parent)
layout = QHBoxLayout(self)
layout.setContentsMargins(0, 0, 0, 0)
self.key_input = QLineEdit()
self.key_input.setPlaceholderText("Key")
# Set validator for key input (letters and dots only)
key_validator = QRegularExpressionValidator(QRegularExpression(r"[A-Za-z.]+"))
self.key_input.setValidator(key_validator)
layout.addWidget(self.key_input)
self.type_combo = QComboBox()
self.type_combo.addItems(["int", "str", "float"])
layout.addWidget(self.type_combo)
self.value_input = QLineEdit()
self.value_input.setPlaceholderText("Value")
layout.addWidget(self.value_input)
delete_button = QPushButton("X")
delete_button.setFixedSize(30, 30)
delete_button.clicked.connect(self.delete_clicked)
layout.addWidget(delete_button)
# Connect type change to validator update
self.type_combo.currentTextChanged.connect(self.update_validator)
# Initialize validator
self.update_validator(self.type_combo.currentText())
def delete_clicked(self):
self.deleted.emit(self)
def get_override_string(self, model_name=None, quant_type=None, output_path=None): # Add arguments
key = self.key_input.text()
type_ = self.type_combo.currentText()
value = self.value_input.text()
dynamic_params = {
"{system.time.milliseconds}": lambda: str(int(time.time() * 1000)),
"{system.time.seconds}": lambda: str(int(time.time())),
"{system.date.iso}": lambda: datetime.now().strftime("%Y-%m-%d"),
"{system.datetime.iso}": lambda: datetime.now().isoformat(),
"{system.username}": lambda: os.getlogin(),
"{system.hostname}": lambda: socket.gethostname(),
"{system.platform}": lambda: platform.system(),
"{system.python.version}": lambda: platform.python_version(),
"{system.time.milliseconds}": lambda: str(int(time.time() * 1000)),
"{system.date}": lambda: datetime.now().strftime("%Y-%m-%d"),
"{model.name}": lambda: model_name if model_name is not None else "Unknown Model",
"{quant.type}": lambda: quant_type if quant_type is not None else "Unknown Quant",
"{output.path}": lambda: output_path if output_path is not None else "Unknown Output Path",
}
for param, func in dynamic_params.items():
value = value.replace(param, func())
return f"{key}={type_}:{value}"
def get_raw_override_string(self):
# Return the raw override string with placeholders intact
return f"{self.key_input.text()}={self.type_combo.currentText()}:{self.value_input.text()}"
def update_validator(self, type_):
if type_ == "int":
self.value_input.setValidator(QIntValidator())
elif type_ == "float":
self.value_input.setValidator(QDoubleValidator())
else: # str
self.value_input.setValidator(None)
import locale
import os
import platform
import shutil
import socket
import time
from datetime import datetime
import psutil
from PySide6.QtCore import QRegularExpression, Signal
from PySide6.QtGui import QDoubleValidator, QIntValidator, QRegularExpressionValidator
from PySide6.QtWidgets import QComboBox, QHBoxLayout, QLineEdit, QPushButton, QWidget
class KVOverrideEntry(QWidget):
deleted = Signal(QWidget)
def __init__(self, parent=None) -> None:
super().__init__(parent)
layout = QHBoxLayout(self)
layout.setContentsMargins(0, 0, 0, 0)
self.key_input = QLineEdit()
self.key_input.setPlaceholderText("Key")
# Set validator for key input (letters and dots only)
key_validator = QRegularExpressionValidator(QRegularExpression(r"[A-Za-z.]+"))
self.key_input.setValidator(key_validator)
layout.addWidget(self.key_input)
self.type_combo = QComboBox()
self.type_combo.addItems(["int", "str", "float", "u32", "i32"])
layout.addWidget(self.type_combo)
self.value_input = QLineEdit()
self.value_input.setPlaceholderText("Value")
layout.addWidget(self.value_input)
delete_button = QPushButton("")
delete_button.setFixedSize(30, 30)
delete_button.clicked.connect(self.delete_clicked)
layout.addWidget(delete_button)
# Connect type change to validator update
self.type_combo.currentTextChanged.connect(self.update_validator)
# Initialize validator
self.update_validator(self.type_combo.currentText())
def delete_clicked(self) -> None:
self.deleted.emit(self)
def get_override_string(
self,
model_name=None,
quant_type=None,
output_path=None,
quantization_parameters=None,
) -> str: # Add arguments
key = self.key_input.text()
type_ = self.type_combo.currentText()
value = self.value_input.text()
dynamic_params = {
"{system.time.milliseconds}": lambda: str(int(time.time() * 1000)),
"{system.time.seconds}": lambda: str(int(time.time())),
"{system.date.iso}": lambda: datetime.now().strftime("%Y-%m-%d"),
"{system.datetime.iso}": lambda: datetime.now().isoformat(),
"{system.username}": lambda: os.getlogin(),
"{system.hostname}": lambda: socket.gethostname(),
"{system.platform}": lambda: platform.system(),
"{system.python.version}": lambda: platform.python_version(),
"{system.timezone}": lambda: time.tzname[time.daylight],
"{system.cpus}": lambda: str(os.cpu_count()),
"{system.memory.total}": lambda: str(psutil.virtual_memory().total),
"{system.memory.free}": lambda: str(psutil.virtual_memory().free),
"{system.filesystem.used}": lambda: str(shutil.disk_usage("/").used),
"{system.kernel.version}": lambda: platform.release(),
"{system.locale}": lambda: locale.getdefaultlocale()[0],
"{process.nice}": lambda: str(os.nice(0)),
"{model.name}": lambda: (
model_name if model_name is not None else "Unknown Model"
),
"{quant.type}": lambda: (
quant_type if quant_type is not None else "Unknown Quant"
),
"{output.path}": lambda: (
output_path if output_path is not None else "Unknown Output Path"
),
"{quant.kv}": lambda: (
quantization_parameters[0]
if quantization_parameters is not None
else False
),
"{quant.requantized}": lambda: (
quantization_parameters[1]
if quantization_parameters is not None
else False
),
"{quant.leave_output_tensor}": lambda: (
quantization_parameters[2]
if quantization_parameters is not None
else False
),
}
for param, func in dynamic_params.items():
value = value.replace(param, func())
return f"{key}={type_}:{value}"
def get_raw_override_string(self) -> str:
# Return the raw override string with placeholders intact
return f"{self.key_input.text()}={self.type_combo.currentText()}:{self.value_input.text()}"
def update_validator(self, type_) -> None:
if type_ == "int":
self.value_input.setValidator(QIntValidator())
elif type_ == "float":
self.value_input.setValidator(QDoubleValidator())
else: # str
self.value_input.setValidator(None)

File diff suppressed because it is too large.


@ -1,46 +1,50 @@
import logging
from logging.handlers import RotatingFileHandler
import os
import sys
from datetime import datetime
class Logger:
def __init__(self, name, log_dir):
self.logger = logging.getLogger(name)
self.logger.setLevel(logging.DEBUG)
# Create logs directory if it doesn't exist
os.makedirs(log_dir, exist_ok=True)
# Console handler
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
console_handler.setFormatter(console_format)
# File handler
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = os.path.join(log_dir, f"latest_{timestamp}.log")
file_handler = RotatingFileHandler(log_file, maxBytes=10*1024*1024, backupCount=5, encoding='utf-8')
file_handler.setLevel(logging.DEBUG)
file_format = logging.Formatter('%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s')
file_handler.setFormatter(file_format)
# Add handlers to logger
self.logger.addHandler(console_handler)
self.logger.addHandler(file_handler)
def debug(self, message):
self.logger.debug(message)
def info(self, message):
self.logger.info(message)
def warning(self, message):
self.logger.warning(message)
def error(self, message):
self.logger.error(message)
def critical(self, message):
self.logger.critical(message)
import logging
from logging.handlers import RotatingFileHandler
import os
from datetime import datetime
class Logger:
def __init__(self, name, log_dir) -> None:
self.logger = logging.getLogger(name)
self.logger.setLevel(logging.DEBUG)
# Create logs directory if it doesn't exist
os.makedirs(log_dir, exist_ok=True)
# Console handler
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_format = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
console_handler.setFormatter(console_format)
# File handler
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = os.path.join(log_dir, f"latest_{timestamp}.log")
file_handler = RotatingFileHandler(
log_file, maxBytes=10 * 1024 * 1024, backupCount=5, encoding="utf-8"
)
file_handler.setLevel(logging.DEBUG)
file_format = logging.Formatter(
"%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s"
)
file_handler.setFormatter(file_format)
# Add handlers to logger
self.logger.addHandler(console_handler)
self.logger.addHandler(file_handler)
def debug(self, message) -> None:
self.logger.debug(message)
def info(self, message) -> None:
self.logger.info(message)
def warning(self, message) -> None:
self.logger.warning(message)
def error(self, message) -> None:
self.logger.error(message)
def critical(self, message) -> None:
self.logger.critical(message)


@ -1,48 +1,48 @@
from PyQt6.QtWidgets import *
from PyQt6.QtCore import *
from PyQt6.QtGui import *
import os
import sys
import psutil
import subprocess
import time
import signal
import json
import platform
import requests
import zipfile
from datetime import datetime
class ModelInfoDialog(QDialog):
def __init__(self, model_info, parent=None):
super().__init__(parent)
self.setWindowTitle("Model Information")
self.setGeometry(200, 200, 600, 400)
layout = QVBoxLayout()
info_text = QTextEdit()
info_text.setReadOnly(True)
info_text.setHtml(self.format_model_info(model_info))
layout.addWidget(info_text)
close_button = QPushButton("Close")
close_button.clicked.connect(self.accept)
layout.addWidget(close_button)
self.setLayout(layout)
def format_model_info(self, model_info):
html = "<h2>Model Information</h2>"
html += f"<p><b>Architecture:</b> {model_info.get('architecture', 'N/A')}</p>"
html += f"<p><b>Quantization Type:</b> {model_info.get('quantization_type', 'N/A')}</p>"
html += f"<p><b>KV Pairs:</b> {model_info.get('kv_pairs', 'N/A')}</p>"
html += f"<p><b>Tensors:</b> {model_info.get('tensors', 'N/A')}</p>"
html += "<h3>Key-Value Pairs:</h3>"
for key, value in model_info.get('kv_data', {}).items():
html += f"<p><b>{key}:</b> {value}</p>"
return html
from PySide6.QtWidgets import QVBoxLayout, QTextEdit, QDialog, QPushButton
class ModelInfoDialog(QDialog):
def __init__(self, model_info, parent=None) -> None:
super().__init__(parent)
self.setWindowTitle("Model Information")
self.setGeometry(200, 200, 600, 400)
layout = QVBoxLayout()
info_text = QTextEdit()
info_text.setReadOnly(True)
info_text.setHtml(self.format_model_info(model_info))
layout.addWidget(info_text)
close_button = QPushButton("Close")
close_button.clicked.connect(self.accept)
layout.addWidget(close_button)
self.setLayout(layout)
def format_model_info(self, model_info) -> str:
html = "<h2>Model Information</h2>"
html += f"<p><b>Architecture:</b> {model_info.get('architecture', 'N/A')}</p>"
# Format quantization types
quant_types = model_info.get("quantization_type", [])
if quant_types:
# Clean up the format: remove "- type " prefix and join with " | "
formatted_types = []
for qtype in quant_types:
# Remove "- type " prefix if present
clean_type = qtype.replace("- type ", "").strip()
formatted_types.append(clean_type)
quant_display = " | ".join(formatted_types)
else:
quant_display = "N/A"
html += f"<p><b>Quantization Type:</b> {quant_display}</p>"
html += f"<p><b>Tensors:</b> {model_info.get('tensors', 'N/A')}</p>"
html += "<h3>Key-Value Pairs:</h3>"
for key, value in model_info.get("kv_data", {}).items():
html += f"<p><b>{key}:</b> {value}</p>"
return html

81
src/Plugins.py Normal file

@ -0,0 +1,81 @@
import importlib
import os
from typing import Any, Dict
from Localizations import *
class Plugins:
def load_plugins(self) -> Dict[str, Dict[str, Any]]:
plugins = {}
plugin_dir = "plugins"
if not os.path.exists(plugin_dir):
self.logger.info(PLUGINS_DIR_NOT_EXIST.format(plugin_dir))
return plugins
if not os.path.isdir(plugin_dir):
self.logger.warning(PLUGINS_DIR_NOT_DIRECTORY.format(plugin_dir))
return plugins
for file in os.listdir(plugin_dir):
if file.endswith(".py") and not file.endswith(".disabled.py"):
name = file[:-3]
path = os.path.join(plugin_dir, file)
try:
spec = importlib.util.spec_from_file_location(name, path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
for item_name in dir(module):
item = getattr(module, item_name)
if isinstance(item, type) and hasattr(item, "__data__"):
plugin_instance = item()
plugin_data = plugin_instance.__data__()
compatible_versions = plugin_data.get(
"compatible_versions", []
)
if (
"*" in compatible_versions
or AUTOGGUF_VERSION in compatible_versions
):
plugins[name] = {
"instance": plugin_instance,
"data": plugin_data,
}
self.logger.info(
PLUGIN_LOADED.format(
plugin_data["name"], plugin_data["version"]
)
)
else:
self.logger.warning(
PLUGIN_INCOMPATIBLE.format(
plugin_data["name"],
plugin_data["version"],
AUTOGGUF_VERSION,
", ".join(compatible_versions),
)
)
break
except Exception as e:
self.logger.error(PLUGIN_LOAD_FAILED.format(name, str(e)))
return plugins
def apply_plugins(self) -> None:
if not self.plugins:
self.logger.info(NO_PLUGINS_LOADED)
return
for plugin_name, plugin_info in self.plugins.items():
plugin_instance = plugin_info["instance"]
for attr_name in dir(plugin_instance):
if not attr_name.startswith("__") and attr_name != "init":
attr_value = getattr(plugin_instance, attr_name)
setattr(self, attr_name, attr_value)
if hasattr(plugin_instance, "init") and callable(plugin_instance.init):
plugin_instance.init(self)


@ -1,94 +1,155 @@
from PyQt6.QtWidgets import *
from PyQt6.QtCore import *
from PyQt6.QtGui import *
import os
import sys
import psutil
import subprocess
import time
import signal
import json
import platform
import requests
import zipfile
import traceback
from datetime import datetime
from imports_and_globals import open_file_safe
class QuantizationThread(QThread):
# Define custom signals for communication with the main thread
output_signal = pyqtSignal(str)
status_signal = pyqtSignal(str)
finished_signal = pyqtSignal()
error_signal = pyqtSignal(str)
model_info_signal = pyqtSignal(dict)
def __init__(self, command, cwd, log_file):
super().__init__()
self.command = command
self.cwd = cwd
self.log_file = log_file
self.process = None
self.model_info = {}
def run(self):
try:
# Start the subprocess
self.process = subprocess.Popen(
self.command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
cwd=self.cwd,
)
# Open log file and process output
with open_file_safe(self.log_file, "w") as log:
for line in self.process.stdout:
line = line.strip()
self.output_signal.emit(line)
log.write(line + "\n")
log.flush()
self.status_signal.emit("In Progress")
self.parse_model_info(line)
# Wait for process to complete
self.process.wait()
if self.process.returncode == 0:
self.status_signal.emit("Completed")
self.model_info_signal.emit(self.model_info)
else:
self.error_signal.emit(
f"Process exited with code {self.process.returncode}"
)
self.finished_signal.emit()
except Exception as e:
self.error_signal.emit(str(e))
def parse_model_info(self, line):
# Parse output for model information
if "llama_model_loader: loaded meta data with" in line:
parts = line.split()
self.model_info["kv_pairs"] = parts[6]
self.model_info["tensors"] = parts[9]
elif "general.architecture" in line:
self.model_info["architecture"] = line.split("=")[-1].strip()
elif line.startswith("llama_model_loader: - kv"):
key = line.split(":")[2].strip()
value = line.split("=")[-1].strip()
self.model_info.setdefault("kv_data", {})[key] = value
elif line.startswith("llama_model_loader: - type"):
parts = line.split(":")
if len(parts) > 1:
quant_type = parts[1].strip()
tensors = parts[2].strip().split()[0]
self.model_info.setdefault("quantization_type", []).append(
f"{quant_type}: {tensors} tensors"
)
def terminate(self):
# Terminate the subprocess if it's still running
if self.process:
os.kill(self.process.pid, signal.SIGTERM)
self.process.wait(timeout=5)
if self.process.poll() is None:
os.kill(self.process.pid, signal.SIGKILL)
import os
import re
import signal
import subprocess
from PySide6.QtCore import Signal, QThread
from globals import open_file_safe
from Localizations import IN_PROGRESS, COMPLETED
class QuantizationThread(QThread):
# Define custom signals for communication with the main thread
output_signal = Signal(str)
status_signal = Signal(str)
finished_signal = Signal()
error_signal = Signal(str)
model_info_signal = Signal(dict)
def __init__(self, command, cwd, log_file) -> None:
super().__init__()
self.command = command
self.cwd = cwd
self.log_file = log_file
self.process = None
self.model_info = {}
def run(self) -> None:
try:
# Start the subprocess
self.process = subprocess.Popen(
self.command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
cwd=self.cwd,
)
# Open log file and process output
with open_file_safe(self.log_file, "w") as log:
for line in self.process.stdout:
line = line.strip()
self.output_signal.emit(line)
log.write(line + "\n")
log.flush()
self.status_signal.emit(IN_PROGRESS)
self.parse_model_info(line)
# Wait for process to complete
self.process.wait()
if self.process.returncode == 0:
self.status_signal.emit(COMPLETED)
self.model_info_signal.emit(self.model_info)
else:
self.error_signal.emit(
f"Process exited with code {self.process.returncode}"
)
self.finished_signal.emit()
except Exception as e:
self.error_signal.emit(str(e))
def parse_model_info(self, line) -> None:
# Mapping of technical keys to human-readable names
key_mappings = {
"general.architecture": "Architecture",
"general.name": "Model Name",
"general.file_type": "File Type",
"general.quantization_version": "Quantization Version",
"llama.block_count": "Layers",
"llama.context_length": "Context Length",
"llama.embedding_length": "Embedding Size",
"llama.feed_forward_length": "Feed Forward Length",
"llama.attention.head_count": "Attention Heads",
"llama.attention.head_count_kv": "Key-Value Heads",
"llama.attention.layer_norm_rms_epsilon": "RMS Norm Epsilon",
"llama.rope.freq_base": "RoPE Frequency Base",
"llama.rope.dimension_count": "RoPE Dimensions",
"llama.vocab_size": "Vocabulary Size",
"tokenizer.ggml.model": "Tokenizer Model",
"tokenizer.ggml.pre": "Tokenizer Preprocessing",
"tokenizer.ggml.tokens": "Tokens",
"tokenizer.ggml.token_type": "Token Types",
"tokenizer.ggml.merges": "BPE Merges",
"tokenizer.ggml.bos_token_id": "Begin of Sequence Token ID",
"tokenizer.ggml.eos_token_id": "End of Sequence Token ID",
"tokenizer.chat_template": "Chat Template",
"tokenizer.ggml.padding_token_id": "Padding Token ID",
"tokenizer.ggml.unk_token_id": "Unknown Token ID",
}
# Parse output for model information
if "llama_model_loader: loaded meta data with" in line:
parts = line.split()
self.model_info["kv_pairs"] = parts[6]
self.model_info["tensors"] = parts[9]
elif "general.architecture" in line:
self.model_info["architecture"] = line.split("=")[-1].strip()
elif line.startswith("llama_model_loader: - kv") and "=" in line:
# Split on '=' and take the parts
parts = line.split("=", 1) # Split only on first '='
left_part = parts[0].strip()
value = parts[1].strip()
# Extract key and type from left part
# Format: "llama_model_loader: - kv N: key type"
kv_parts = left_part.split(":")
if len(kv_parts) >= 3:
key_type_part = kv_parts[2].strip() # This is "key type"
key = key_type_part.rsplit(" ", 1)[
0
] # Everything except last word (type)
# Use human-readable name if available, otherwise use original key
display_key = key_mappings.get(key, key)
self.model_info.setdefault("kv_data", {})[display_key] = value
elif line.startswith("llama_model_loader: - type"):
parts = line.split(":")
if len(parts) > 1:
quant_type = parts[1].strip()
tensors = parts[2].strip().split()[0]
self.model_info.setdefault("quantization_type", []).append(
f"{quant_type}: {tensors} tensors"
)
def parse_progress(self, line, task_item, imatrix_chunks=None) -> None:
# Parses the output line for progress information and updates the task item.
match = re.search(r"\[\s*(\d+)\s*/\s*(\d+)\s*].*", line)
if match:
current = int(match.group(1))
total = int(match.group(2))
progress = int((current / total) * 100)
task_item.update_progress(progress)
else:
imatrix_match = re.search(
r"compute_imatrix: computing over (\d+) chunks with batch_size \d+",
line,
)
if imatrix_match:
imatrix_chunks = int(imatrix_match.group(1))
elif imatrix_chunks is not None:
if "save_imatrix: stored collected data" in line:
save_match = re.search(r"collected data after (\d+) chunks", line)
if save_match:
saved_chunks = int(save_match.group(1))
progress = int((saved_chunks / imatrix_chunks) * 100)
task_item.update_progress(progress)
def terminate(self) -> None:
# Terminate the subprocess if it's still running
if self.process:
os.kill(self.process.pid, signal.SIGTERM)
self.process.wait(timeout=5)
if self.process.poll() is None:
os.kill(self.process.pid, signal.SIGKILL)


@ -1,72 +1,201 @@
from PyQt6.QtWidgets import *
from PyQt6.QtCore import *
from PyQt6.QtGui import *
import os
import sys
import psutil
import subprocess
import time
import signal
import json
import platform
import requests
import zipfile
from datetime import datetime
class TaskListItem(QWidget):
def __init__(self, task_name, log_file, show_progress_bar=True, parent=None):
super().__init__(parent)
self.task_name = task_name
self.log_file = log_file
self.status = "Pending"
layout = QHBoxLayout(self)
self.task_label = QLabel(task_name)
self.progress_bar = QProgressBar()
self.progress_bar.setRange(0, 100)
self.status_label = QLabel(self.status)
layout.addWidget(self.task_label)
layout.addWidget(self.progress_bar)
layout.addWidget(self.status_label)
# Hide progress bar if show_progress_bar is False
self.progress_bar.setVisible(show_progress_bar)
# Use indeterminate progress bar if not showing percentage
if not show_progress_bar:
self.progress_bar.setRange(0, 0)
self.progress_timer = QTimer(self)
self.progress_timer.timeout.connect(self.update_progress)
self.progress_value = 0
def update_status(self, status):
self.status = status
self.status_label.setText(status)
if status == "In Progress":
# Only start timer if showing percentage progress
if self.progress_bar.isVisible():
self.progress_bar.setRange(0, 100)
self.progress_timer.start(100)
elif status == "Completed":
self.progress_timer.stop()
self.progress_bar.setValue(100)
elif status == "Canceled":
self.progress_timer.stop()
self.progress_bar.setValue(0)
def set_error(self):
self.status = "Error"
self.status_label.setText("Error")
self.status_label.setStyleSheet("color: red;")
self.progress_bar.setRange(0, 100)
self.progress_timer.stop()
def update_progress(self, value=None):
if value is not None:
# Update progress bar with specific value
self.progress_value = value
self.progress_bar.setValue(self.progress_value)
else:
# Increment progress bar for indeterminate progress
self.progress_value = (self.progress_value + 1) % 101
self.progress_bar.setValue(self.progress_value)
from typing import List
from PySide6.QtCore import *
from PySide6.QtGui import QAction
from PySide6.QtWidgets import *
from Localizations import (
DELETING_TASK,
CANCELLING_TASK,
CONFIRM_DELETION_TITLE,
CONFIRM_DELETION,
SHOWING_TASK_CONTEXT_MENU,
CANCELED,
CANCEL,
PROPERTIES,
COMPLETED,
SHOWING_PROPERTIES_FOR_TASK,
DELETE,
RESTART,
IN_PROGRESS,
ERROR,
RESTARTING_TASK,
)
from ModelInfoDialog import ModelInfoDialog
from QuantizationThread import QuantizationThread
from Logger import Logger
from error_handling import handle_error
class TaskListItem(QWidget):
def __init__(
self,
task_name,
log_file,
show_progress_bar=True,
parent=None,
show_properties=False,
logger=Logger,
quant_threads=List[QuantizationThread],
) -> None:
super().__init__(parent)
self.quant_threads = quant_threads
self.task_name = task_name
self.log_file = log_file
self.logger = logger
self.show_properties = show_properties
self.status = "Pending"
layout = QHBoxLayout(self)
self.task_label = QLabel(task_name)
self.progress_bar = QProgressBar()
self.progress_bar.setRange(0, 100)
self.status_label = QLabel(self.status)
layout.addWidget(self.task_label)
layout.addWidget(self.progress_bar)
layout.addWidget(self.status_label)
# Hide progress bar if show_progress_bar is False
self.progress_bar.setVisible(show_progress_bar)
# Use indeterminate progress bar if not showing percentage
if not show_progress_bar:
self.progress_bar.setRange(0, 0)
self.progress_timer = QTimer(self)
self.progress_timer.timeout.connect(self.update_progress)
self.progress_value = 0
def show_task_context_menu(self, position) -> None:
self.logger.debug(SHOWING_TASK_CONTEXT_MENU)
item = self.task_list.itemAt(position)
if item is not None:
context_menu = QMenu(self)
properties_action = QAction(PROPERTIES, self)
properties_action.triggered.connect(lambda: self.show_task_properties(item))
context_menu.addAction(properties_action)
task_item = self.task_list.itemWidget(item)
if task_item.status != COMPLETED:
cancel_action = QAction(CANCEL, self)
cancel_action.triggered.connect(lambda: self.cancel_task(item))
context_menu.addAction(cancel_action)
if task_item.status == CANCELED:
restart_action = QAction(RESTART, self)
restart_action.triggered.connect(lambda: self.restart_task(task_item))
context_menu.addAction(restart_action)
delete_action = QAction(DELETE, self)
delete_action.triggered.connect(lambda: self.delete_task(item))
context_menu.addAction(delete_action)
context_menu.exec(self.task_list.viewport().mapToGlobal(position))
def show_task_properties(self, item) -> None:
self.logger.debug(SHOWING_PROPERTIES_FOR_TASK.format(item.text()))
for thread in self.quant_threads:
model_info_dialog = ModelInfoDialog(thread.model_info, self)
model_info_dialog.exec()
break
def cancel_task(self, item) -> None:
# TODO: fix possibly buggy signal behavior
task_item = self.task_list.itemWidget(item)
if task_item:
task_name = task_item.task_name # Store the name before any changes
self.logger.info(CANCELLING_TASK.format(task_name))
# Find the thread and disconnect signals before terminating
for thread in self.quant_threads:
if thread.log_file == task_item.log_file:
# Disconnect all signals from this thread first
try:
thread.error_signal.disconnect() # Disconnect all error signal connections
thread.output_signal.disconnect() # Disconnect all output signal connections
except TypeError:
# No connections to disconnect
pass
# Now terminate the thread
thread.terminate()
self.quant_threads.remove(thread)
break
def delete_task(self, item) -> None:
task_item = self.task_list.itemWidget(item)
if not task_item:
return
task_name = task_item.task_name # Store task_name before deletion
self.logger.info(DELETING_TASK.format(task_name))
reply = QMessageBox.question(
self,
CONFIRM_DELETION_TITLE,
CONFIRM_DELETION,
QMessageBox.StandardButton.Yes | QMessageBox.StandardButton.No,
QMessageBox.StandardButton.No,
)
if reply == QMessageBox.StandardButton.Yes:
# Cancel the task first (which disconnects signals)
self.cancel_task(item)
# Now remove from list and delete
row = self.task_list.row(item)
self.task_list.takeItem(row)
# Delete the widget after removing from list
task_item.deleteLater()
def update_status(self, status) -> None:
self.status = status
self.status_label.setText(status)
if status == IN_PROGRESS:
# Only start timer if showing percentage progress
if self.progress_bar.isVisible():
self.progress_bar.setRange(0, 100)
self.progress_timer.start(100)
elif status == COMPLETED:
self.progress_timer.stop()
self.progress_bar.setValue(100)
elif status == CANCELED:
self.progress_timer.stop()
self.progress_bar.setValue(0)
def set_error(self) -> None:
self.status = ERROR
self.status_label.setText(ERROR)
self.status_label.setStyleSheet("color: red;")
self.progress_bar.setRange(0, 100)
self.progress_timer.stop()
def update_progress(self, value=None) -> None:
if value is not None:
# Update progress bar with specific value
self.progress_value = value
self.progress_bar.setValue(self.progress_value)
else:
return
def restart_task(self, task_item) -> None:
self.logger.info(RESTARTING_TASK.format(task_item.task_name))
for thread in self.quant_threads:
if thread.log_file == task_item.log_file:
new_thread = QuantizationThread(
thread.command, thread.cwd, thread.log_file
)
self.quant_threads.append(new_thread)
new_thread.status_signal.connect(task_item.update_status)
new_thread.finished_signal.connect(
lambda: self.task_finished(new_thread, task_item)
)
new_thread.error_signal.connect(
lambda err: handle_error(self.logger, err, task_item)
)
new_thread.model_info_signal.connect(self.update_model_info)
new_thread.start()
task_item.update_status(IN_PROGRESS)
break
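For context, a minimal sketch of how a TaskListItem might be wired into a QListWidget; the list, logger, and task names below are illustrative and are not taken from the diff itself (it assumes the imports at the top of this file):
# Hypothetical wiring of a TaskListItem into a QListWidget (all names are illustrative)
task_list = QListWidget()
quant_threads: List[QuantizationThread] = []
list_entry = QListWidgetItem(task_list)
task_widget = TaskListItem(
    "Quantize model.gguf to Q4_K_M",   # task_name (example)
    "logs/quantize_model.log",         # log_file (example)
    show_progress_bar=True,
    logger=app_logger,                 # an existing Logger instance (hypothetical name)
    quant_threads=quant_threads,
)
list_entry.setSizeHint(task_widget.sizeHint())
task_list.setItemWidget(list_entry, task_widget)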

File diff suppressed because it is too large

@ -1,20 +1,17 @@
#!/usr/bin/env python3
from __future__ import annotations
import logging
import json
import logging
import os
import struct
import sys
from pathlib import Path
from typing import Any, BinaryIO, Sequence
from typing import BinaryIO
import numpy as np
import torch
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
from gguf.constants import *
from gguf.tensor_mapping import *
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("lora-to-gguf")
@ -35,7 +32,9 @@ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
fout.write(struct.pack("i", int(params["lora_alpha"])))
def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
def write_tensor_header(
fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]
) -> None:
sname = name.encode("utf-8")
fout.write(
struct.pack(
@ -49,20 +48,21 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty
fout.write(sname)
fout.seek((fout.tell() + 31) & -32)
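# Illustrative note on the seek above: (offset + 31) & -32 rounds an offset up to the next
# multiple of 32, padding each tensor header to 32-byte alignment, e.g.
# 0 -> 0, 1 -> 32, 31 -> 32, 32 -> 32, 33 -> 64, 100 -> 128.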
def pyinstaller_include():
# PyInstaller import
pass
if __name__ == '__main__':
if __name__ == "__main__":
if len(sys.argv) < 2:
logger.info(f"Usage: python {sys.argv[0]} <path> [arch]")
logger.info("Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'")
logger.info(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
logger.info(f"Usage: python {sys.argv[0]} <path> <output_path> [arch]")
logger.info(
"Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
)
logger.info(
f"Arch must be one of {list(MODEL_ARCH_NAMES.values())} (default: llama)"
)
sys.exit(1)
input_json = os.path.join(sys.argv[1], "adapter_config.json")
input_model = os.path.join(sys.argv[1], "adapter_model.bin")
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
output_path = sys.argv[2]
if os.path.exists(input_model):
model = torch.load(input_model, map_location="cpu")
@ -70,22 +70,27 @@ def pyinstaller_include():
input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
# lazy import load_file only if lora is in safetensors format.
from safetensors.torch import load_file
model = load_file(input_model, device="cpu")
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
arch_name = sys.argv[3] if len(sys.argv) == 4 else "llama"
if arch_name not in gguf.MODEL_ARCH_NAMES.values():
if arch_name not in MODEL_ARCH_NAMES.values():
logger.error(f"Error: unsupported architecture {arch_name}")
sys.exit(1)
arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
arch = list(MODEL_ARCH_NAMES.keys())[
list(MODEL_ARCH_NAMES.values()).index(arch_name)
]
name_map = TensorNameMap(arch, 500)
with open(input_json, "r") as f:
params = json.load(f)
if params["peft_type"] != "LORA":
logger.error(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
logger.error(
f"Error: unsupported adapter type {params['peft_type']}, expected LORA"
)
sys.exit(1)
if params["fan_in_fan_out"] is True:
@ -127,7 +132,7 @@ def pyinstaller_include():
lora_suffixes = (".lora_A.weight", ".lora_B.weight")
if k.endswith(lora_suffixes):
suffix = k[-len(lora_suffixes[0]):]
suffix = k[-len(lora_suffixes[0]) :]
k = k[: -len(lora_suffixes[0])]
else:
logger.error(f"Error: unrecognized tensor name {orig_k}")
@ -136,7 +141,9 @@ def pyinstaller_include():
tname = name_map.get_name(k)
if tname is None:
logger.error(f"Error: could not map tensor name {orig_k}")
logger.error(" Note: the arch parameter must be specified if the model is not llama")
logger.error(
" Note: the arch parameter must be specified if the model is not llama"
)
sys.exit(1)
if suffix == ".lora_A.weight":
@ -146,7 +153,9 @@ def pyinstaller_include():
else:
assert False
logger.info(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
logger.info(
f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB"
)
write_tensor_header(fout, tname, t.shape, t.dtype)
t.tofile(fout)


@ -1,6 +1,3 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import annotations
from dataclasses import dataclass
@ -21,18 +18,16 @@
SupportsIndex,
cast,
)
from transformers import AutoConfig
import torch
if TYPE_CHECKING:
from torch import Tensor
if "NO_LOCAL_GGUF" not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / "gguf-py"))
import gguf
# reuse model definitions from convert_hf_to_gguf.py
from convert_hf_to_gguf import LazyTorchTensor, Model
from convert_hf_to_gguf import LazyTorchTensor, ModelBase
logger = logging.getLogger("lora-to-gguf")
@ -245,17 +240,15 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
base_name = lora_tensor_name.replace("base_model.model.", "")
base_name = base_name.replace(".lora_A.weight", ".weight")
base_name = base_name.replace(".lora_B.weight", ".weight")
# models produced by mergekit-extract-lora have token embeddings in the adapter
base_name = base_name.replace(".lora_embedding_A", ".weight")
base_name = base_name.replace(".lora_embedding_B", ".weight")
return base_name
def pyinstaller_include():
# PyInstaller import
pass
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file"
description="Convert a Hugging Face PEFT LoRA adapter to a GGUF file"
)
parser.add_argument(
"--outfile",
@ -292,18 +285,28 @@ def parse_args() -> argparse.Namespace:
parser.add_argument(
"--base",
type=Path,
required=True,
help="directory containing base model file",
help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config",
)
parser.add_argument(
"--base-model-id",
type=str,
help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
)
parser.add_argument(
"lora_path",
type=Path,
help="directory containing LoRA adapter file",
help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
)
return parser.parse_args()
def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
# normally, adapter does not come with base model config, we need to load it from AutoConfig
config = AutoConfig.from_pretrained(hf_model_id)
return config.to_dict()
if __name__ == "__main__":
args = parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
@ -318,8 +321,9 @@ def parse_args() -> argparse.Namespace:
ftype = ftype_map[args.outtype]
dir_base_model: Path = args.base
dir_base_model: Path | None = args.base
dir_lora: Path = args.lora_path
base_model_id: str | None = args.base_model_id
lora_config = dir_lora / "adapter_config.json"
input_model = dir_lora / "adapter_model.safetensors"
@ -338,12 +342,41 @@ def parse_args() -> argparse.Namespace:
input_model = os.path.join(dir_lora, "adapter_model.bin")
lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
# load LoRA config
with open(lora_config, "r") as f:
lparams: dict[str, Any] = json.load(f)
# load base model
logger.info(f"Loading base model: {dir_base_model.name}")
hparams = Model.load_hparams(dir_base_model)
if base_model_id is not None:
logger.info(f"Loading base model from Hugging Face: {base_model_id}")
hparams = load_hparams_from_hf(base_model_id)
elif dir_base_model is None:
if "base_model_name_or_path" in lparams:
model_id = lparams["base_model_name_or_path"]
logger.info(f"Loading base model from Hugging Face: {model_id}")
try:
hparams = load_hparams_from_hf(model_id)
except OSError as e:
logger.error(f"Failed to load base model config: {e}")
logger.error(
"Please try downloading the base model and add its path to --base"
)
sys.exit(1)
else:
logger.error(
"'base_model_name_or_path' is not found in adapter_config.json"
)
logger.error(
"Base model config is required. Please download the base model and add its path to --base"
)
sys.exit(1)
else:
logger.info(f"Loading base model: {dir_base_model.name}")
hparams = ModelBase.load_hparams(dir_base_model)
with torch.inference_mode():
try:
model_class = Model.from_model_architecture(hparams["architectures"][0])
model_class = ModelBase.from_model_architecture(hparams["architectures"][0])
except NotImplementedError:
logger.error(f"Model {hparams['architectures'][0]} is not supported")
sys.exit(1)
@ -362,6 +395,9 @@ def __init__(
self.dir_model_card = dir_lora_model
self.lora_alpha = float(lora_alpha)
def set_vocab(self):
pass
def set_type(self):
self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
@ -370,7 +406,10 @@ def set_gguf_parameters(self):
self.gguf_writer.add_float32(
gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha
)
super().set_gguf_parameters()
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
# Never add extra tensors (e.g. rope_freqs) for LoRA adapters
return ()
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
tensor_map: dict[str, PartialLoraTensor] = {}
@ -379,14 +418,26 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
if self.lazy:
tensor = LazyTorchTensor.from_eager(tensor)
base_name = get_base_tensor_name(name)
is_lora_a = ".lora_A.weight" in name
is_lora_b = ".lora_B.weight" in name
# note: mergekit-extract-lora also adds token embeddings to the adapter
is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
if not is_lora_a and not is_lora_b:
if ".base_layer.weight" in name:
continue
# mergekit-extract-lora adds these layernorms to the adapter; we need to keep them
if "_layernorm" in name or ".norm" in name:
yield (base_name, tensor)
continue
logger.error(
f"Unexpected name '{name}': Not a lora_A or lora_B tensor"
)
if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
logger.error(
"Embeddings is present in the adapter. This can be due to new tokens added during fine tuning"
)
logger.error(
"Please refer to https://github.com/ggml-org/llama.cpp/pull/9948"
)
sys.exit(1)
if base_name in tensor_map:
@ -411,17 +462,34 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
def modify_tensors(
self, data_torch: Tensor, name: str, bid: int | None
) -> Iterable[tuple[str, Tensor]]:
dest = super().modify_tensors(data_torch, name, bid)
dest = list(super().modify_tensors(data_torch, name, bid))
# some archs may have the same tensor for lm_head and output (tied word embeddings)
# in this case, adapters targeting lm_head will fail when using llama-export-lora
# therefore, we ignore them for now
# see: https://github.com/ggml-org/llama.cpp/issues/9065
if name == "lm_head.weight" and len(dest) == 0:
raise ValueError(
"lm_head is present in adapter, but is ignored in base model"
)
for dest_name, dest_data in dest:
# mergekit-extract-lora adds these layernorms to the adapter
if "_norm" in dest_name:
assert dest_data.dim() == 1
yield (dest_name, dest_data)
continue
# otherwise, we must get the lora_A and lora_B tensors
assert isinstance(dest_data, LoraTorchTensor)
lora_a, lora_b = dest_data.get_lora_A_B()
# note: mergekit-extract-lora flips and transposes A and B
# here we only need to transpose token_embd.lora_a, see llm_build_inp_embd()
if "token_embd.weight" in dest_name:
lora_a = lora_a.T
yield (dest_name + ".lora_a", lora_a)
yield (dest_name + ".lora_b", lora_b)
with open(lora_config, "r") as f:
lparams: dict[str, Any] = json.load(f)
alpha: float = lparams["lora_alpha"]
model_instance = LoraModel(
@ -434,6 +502,7 @@ def modify_tensors(
dry_run=args.dry_run,
dir_lora_model=dir_lora,
lora_alpha=alpha,
hparams=hparams,
)
logger.info("Exporting model...")

src/dequantize_gguf.py (new file)

@ -0,0 +1,105 @@
import argparse
import json
import sys
from pathlib import Path
import numpy as np
import torch
from safetensors.torch import save_file
import gguf
def dequantize_tensor(tensor):
if tensor.tensor_type in [
gguf.GGMLQuantizationType.F32,
gguf.GGMLQuantizationType.F16,
gguf.GGMLQuantizationType.BF16,
]:
return np.array(tensor.data)
else:
return tensor.data.astype(np.float32)
def gguf_to_safetensors(gguf_path, safetensors_path, metadata_path=None):
try:
reader = gguf.GGUFReader(gguf_path)
except Exception as e:
print(f"Error reading GGUF file: {e}", file=sys.stderr)
sys.exit(1)
tensors = {}
metadata = {}
for tensor in reader.tensors:
try:
dequantized_data = dequantize_tensor(tensor)
tensors[tensor.name] = torch.from_numpy(
dequantized_data.reshape(tuple(reversed(tensor.shape)))
)
except Exception as e:
print(f"Error processing tensor {tensor.name}: {e}", file=sys.stderr)
continue
for field_name, field in reader.fields.items():
if field.data:
metadata[field_name] = field.parts[field.data[0]].tolist()
try:
save_file(tensors, safetensors_path)
except Exception as e:
print(f"Error saving SafeTensors file: {e}", file=sys.stderr)
sys.exit(1)
decoded_metadata = {}
for key, value in metadata.items():
if isinstance(value, list) and all(isinstance(item, int) for item in value):
decoded_value = ""
for item in value:
if 48 <= item <= 57:
decoded_value += str(item - 48)
elif 32 <= item <= 126:
decoded_value += chr(item)
else:
decoded_value += str(item)
decoded_metadata[key] = decoded_value
else:
decoded_metadata[key] = value
if metadata_path:
try:
with open(metadata_path, "w") as f:
json.dump(decoded_metadata, f, indent=4)
except Exception as e:
print(f"Error saving metadata file: {e}", file=sys.stderr)
def main():
parser = argparse.ArgumentParser(description="Convert GGUF to SafeTensors format")
parser.add_argument("gguf_path", type=str, help="Path to the input GGUF file")
parser.add_argument(
"safetensors_path", type=str, help="Path to save the SafeTensors file"
)
parser.add_argument(
"--metadata_path",
type=str,
help="Optional path to save metadata as a JSON file",
)
args = parser.parse_args()
gguf_path = Path(args.gguf_path)
safetensors_path = Path(args.safetensors_path)
metadata_path = Path(args.metadata_path) if args.metadata_path else None
if not gguf_path.exists():
print(f"Error: GGUF file '{gguf_path}' does not exist.", file=sys.stderr)
sys.exit(1)
print(f"Converting {gguf_path} to {safetensors_path}")
gguf_to_safetensors(gguf_path, safetensors_path, metadata_path)
print("Conversion complete.")
if __name__ == "__main__":
main()
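As a usage sketch (all paths are placeholders, not files from the repository), the converter can be run from the command line or by importing gguf_to_safetensors directly:
# python src/dequantize_gguf.py model.Q8_0.gguf model.safetensors --metadata_path model.metadata.json
from dequantize_gguf import gguf_to_safetensors

gguf_to_safetensors(
    "model.Q8_0.gguf",                     # input GGUF (placeholder)
    "model.safetensors",                   # output SafeTensors (placeholder)
    metadata_path="model.metadata.json",   # optional JSON dump of the GGUF key/value metadata
)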

src/error_handling.py (new file)

@ -0,0 +1,13 @@
from PySide6.QtWidgets import QMessageBox
from Localizations import ERROR_MESSAGE, ERROR, TASK_ERROR
def show_error(logger, message) -> None:
logger.error(message)
QMessageBox.critical(None, ERROR, message)
def handle_error(logger, error_message, task_item) -> None:
logger.error(TASK_ERROR.format(error_message))
show_error(logger, error_message)
task_item.update_status(ERROR)
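A minimal usage sketch (the logger, thread, and task widget names are illustrative): show_error logs a message and pops a critical QMessageBox, while handle_error additionally marks the owning task item as errored, typically from a thread's error signal:
show_error(app_logger, "Backend download failed")   # hypothetical logger and message
thread.error_signal.connect(lambda err: handle_error(app_logger, err, task_widget))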

File diff suppressed because it is too large

@ -1,503 +0,0 @@
from __future__ import annotations
import re
import json
import yaml
import logging
from pathlib import Path
from typing import Any, Literal, Optional
from dataclasses import dataclass
from .constants import Keys
import gguf
logger = logging.getLogger("metadata")
@dataclass
class Metadata:
# Authorship Metadata to be written to GGUF KV Store
name: Optional[str] = None
author: Optional[str] = None
version: Optional[str] = None
organization: Optional[str] = None
finetune: Optional[str] = None
basename: Optional[str] = None
description: Optional[str] = None
quantized_by: Optional[str] = None
size_label: Optional[str] = None
url: Optional[str] = None
doi: Optional[str] = None
uuid: Optional[str] = None
repo_url: Optional[str] = None
source_url: Optional[str] = None
source_doi: Optional[str] = None
source_uuid: Optional[str] = None
source_repo_url: Optional[str] = None
license: Optional[str] = None
license_name: Optional[str] = None
license_link: Optional[str] = None
base_models: Optional[list[dict]] = None
tags: Optional[list[str]] = None
languages: Optional[list[str]] = None
datasets: Optional[list[str]] = None
@staticmethod
def load(metadata_override_path: Optional[Path] = None, model_path: Optional[Path] = None, model_name: Optional[str] = None, total_params: int = 0) -> Metadata:
# This grabs as many contextual authorship metadata as possible from the model repository
# making any conversion as required to match the gguf kv store metadata format
# as well as giving users the ability to override any authorship metadata that may be incorrect
# Create a new Metadata instance
metadata = Metadata()
model_card = Metadata.load_model_card(model_path)
hf_params = Metadata.load_hf_parameters(model_path)
# TODO: load adapter_config.json when possible, it usually contains the base model of the LoRA adapter
# heuristics
metadata = Metadata.apply_metadata_heuristic(metadata, model_card, hf_params, model_path, total_params)
# Metadata Override File Provided
# This is based on LLM_KV_NAMES mapping in llama.cpp
metadata_override = Metadata.load_metadata_override(metadata_override_path)
metadata.name = metadata_override.get(Keys.General.NAME, metadata.name)
metadata.author = metadata_override.get(Keys.General.AUTHOR, metadata.author)
metadata.version = metadata_override.get(Keys.General.VERSION, metadata.version)
metadata.organization = metadata_override.get(Keys.General.ORGANIZATION, metadata.organization)
metadata.finetune = metadata_override.get(Keys.General.FINETUNE, metadata.finetune)
metadata.basename = metadata_override.get(Keys.General.BASENAME, metadata.basename)
metadata.description = metadata_override.get(Keys.General.DESCRIPTION, metadata.description)
metadata.quantized_by = metadata_override.get(Keys.General.QUANTIZED_BY, metadata.quantized_by)
metadata.size_label = metadata_override.get(Keys.General.SIZE_LABEL, metadata.size_label)
metadata.license_name = metadata_override.get(Keys.General.LICENSE_NAME, metadata.license_name)
metadata.license_link = metadata_override.get(Keys.General.LICENSE_LINK, metadata.license_link)
metadata.url = metadata_override.get(Keys.General.URL, metadata.url)
metadata.doi = metadata_override.get(Keys.General.DOI, metadata.doi)
metadata.uuid = metadata_override.get(Keys.General.UUID, metadata.uuid)
metadata.repo_url = metadata_override.get(Keys.General.REPO_URL, metadata.repo_url)
metadata.source_url = metadata_override.get(Keys.General.SOURCE_URL, metadata.source_url)
metadata.source_doi = metadata_override.get(Keys.General.SOURCE_DOI, metadata.source_doi)
metadata.source_uuid = metadata_override.get(Keys.General.SOURCE_UUID, metadata.source_uuid)
metadata.source_repo_url = metadata_override.get(Keys.General.SOURCE_REPO_URL, metadata.source_repo_url)
# Base Models is received here as an array of models
metadata.base_models = metadata_override.get("general.base_models", metadata.base_models)
metadata.tags = metadata_override.get(Keys.General.TAGS, metadata.tags)
metadata.languages = metadata_override.get(Keys.General.LANGUAGES, metadata.languages)
metadata.datasets = metadata_override.get(Keys.General.DATASETS, metadata.datasets)
# Direct Metadata Override (via direct cli argument)
if model_name is not None:
metadata.name = model_name
return metadata
@staticmethod
def load_metadata_override(metadata_override_path: Optional[Path] = None) -> dict[str, Any]:
if metadata_override_path is None or not metadata_override_path.is_file():
return {}
with open(metadata_override_path, "r", encoding="utf-8") as f:
return json.load(f)
@staticmethod
def load_model_card(model_path: Optional[Path] = None) -> dict[str, Any]:
if model_path is None or not model_path.is_dir():
return {}
model_card_path = model_path / "README.md"
if not model_card_path.is_file():
return {}
# The model card metadata is assumed to always be in YAML
# ref: https://github.com/huggingface/transformers/blob/a5c642fe7a1f25d3bdcd76991443ba6ff7ee34b2/src/transformers/modelcard.py#L468-L473
with open(model_card_path, "r", encoding="utf-8") as f:
if f.readline() == "---\n":
raw = f.read().partition("---\n")[0]
data = yaml.safe_load(raw)
if isinstance(data, dict):
return data
else:
logger.error(f"while reading YAML model card frontmatter, data is {type(data)} instead of dict")
return {}
else:
return {}
@staticmethod
def load_hf_parameters(model_path: Optional[Path] = None) -> dict[str, Any]:
if model_path is None or not model_path.is_dir():
return {}
config_path = model_path / "config.json"
if not config_path.is_file():
return {}
with open(config_path, "r", encoding="utf-8") as f:
return json.load(f)
@staticmethod
def id_to_title(string):
# Convert capitalization into title form unless acronym or version number
return ' '.join([w.title() if w.islower() and not re.match(r'^(v\d+(?:\.\d+)*|\d.*)$', w) else w for w in string.strip().replace('-', ' ').split()])
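# Illustrative: id_to_title("mistral-7b-instruct-v0.2") -> "Mistral 7b Instruct v0.2";
# lowercase words are title-cased while tokens that look like sizes or versions are left as-is.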
@staticmethod
def get_model_id_components(model_id: Optional[str] = None, total_params: int = 0) -> tuple[str | None, str | None, str | None, str | None, str | None, str | None]:
# Huggingface often store model id as '<org>/<model name>'
# so let's parse it and apply some heuristics if possible for model name components
if model_id is None:
# model ID missing
return None, None, None, None, None, None
if ' ' in model_id:
# model ID is actually a normal human sentence
# which means its most likely a normal model name only
# not part of the hugging face naming standard, but whatever
return model_id, None, None, None, None, None
if '/' in model_id:
# model ID (huggingface style)
org_component, model_full_name_component = model_id.split('/', 1)
else:
# model ID but missing org components
org_component, model_full_name_component = None, model_id
# Check if we erroneously matched against './' or '../' etc...
if org_component is not None and org_component[0] == '.':
org_component = None
name_parts: list[str] = model_full_name_component.split('-')
# Remove empty parts
for i in reversed(range(len(name_parts))):
if len(name_parts[i]) == 0:
del name_parts[i]
name_types: list[
set[Literal["basename", "size_label", "finetune", "version", "type"]]
] = [set() for _ in name_parts]
# Annotate the name
for i, part in enumerate(name_parts):
# Version
if re.fullmatch(r'(v|iter)?\d+([.]\d+)*', part, re.IGNORECASE):
name_types[i].add("version")
# Quant type (should not be there for base models, but still annotated)
elif re.fullmatch(r'i?q\d(_\w)*|b?fp?(16|32)', part, re.IGNORECASE):
name_types[i].add("type")
name_parts[i] = part.upper()
# Model size
elif i > 0 and re.fullmatch(r'(([A]|\d+[x])?\d+([._]\d+)?[KMBT][\d]?|small|mini|medium|large|x?xl)', part, re.IGNORECASE):
part = part.replace("_", ".")
# Handle weird bloom-7b1 notation
if part[-1].isdecimal():
part = part[:-2] + "." + part[-1] + part[-2]
# Normalize the size suffixes
if len(part) > 1 and part[-2].isdecimal():
if part[-1] in "kmbt":
part = part[:-1] + part[-1].upper()
if total_params != 0:
try:
label_params = float(part[:-1]) * pow(1000, " KMBT".find(part[-1]))
# Only use it as a size label if it's close or bigger than the model size
# Note that LoRA adapters don't necessarily include all layers,
# so this is why bigger label sizes are accepted.
# Do not use the size label when it's smaller than 1/8 of the model size
if (total_params < 0 and label_params < abs(total_params) // 8) or (
# Check both directions when the current model isn't a LoRA adapter
total_params > 0 and abs(label_params - total_params) > 7 * total_params // 8
):
# Likely a context length
name_types[i].add("finetune")
# Lowercase the size when it's a context length
part = part[:-1] + part[-1].lower()
except ValueError:
# Failed to convert the size label to float, use it anyway
pass
if len(name_types[i]) == 0:
name_types[i].add("size_label")
name_parts[i] = part
# Some easy to recognize finetune names
elif i > 0 and re.fullmatch(r'chat|instruct|vision|lora', part, re.IGNORECASE):
if total_params < 0 and part.lower() == "lora":
# ignore redundant "lora" in the finetune part when the output is a lora adapter
name_types[i].add("type")
else:
name_types[i].add("finetune")
# Ignore word-based size labels when there is at least a number-based one present
# TODO: should word-based size labels always be removed instead?
if any(c.isdecimal() for n, t in zip(name_parts, name_types) if "size_label" in t for c in n):
for n, t in zip(name_parts, name_types):
if "size_label" in t:
if all(c.isalpha() for c in n):
t.remove("size_label")
at_start = True
# Find the basename through the annotated name
for part, t in zip(name_parts, name_types):
if at_start and ((len(t) == 0 and part[0].isalpha()) or "version" in t):
t.add("basename")
else:
if at_start:
at_start = False
if len(t) == 0:
t.add("finetune")
# Remove the basename annotation from trailing version
for part, t in zip(reversed(name_parts), reversed(name_types)):
if "basename" in t and len(t) > 1:
t.remove("basename")
else:
break
basename = "-".join(n for n, t in zip(name_parts, name_types) if "basename" in t) or None
# Deduplicate size labels using order-preserving 'dict' ('set' seems to sort the keys)
size_label = "-".join(dict.fromkeys(s for s, t in zip(name_parts, name_types) if "size_label" in t).keys()) or None
finetune = "-".join(f for f, t in zip(name_parts, name_types) if "finetune" in t) or None
# TODO: should the basename version always be excluded?
# NOTE: multiple finetune versions are joined together
version = "-".join(v for v, t, in zip(name_parts, name_types) if "version" in t and "basename" not in t) or None
if size_label is None and finetune is None and version is None:
# Too ambiguous, output nothing
basename = None
return model_full_name_component, org_component, basename, finetune, version, size_label
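# Worked example (illustrative): get_model_id_components("mistralai/Mistral-7B-Instruct-v0.2")
# returns ("Mistral-7B-Instruct-v0.2", "mistralai", "Mistral", "Instruct", "v0.2", "7B"),
# i.e. (model name, org, basename, finetune, version, size label).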
@staticmethod
def apply_metadata_heuristic(metadata: Metadata, model_card: Optional[dict] = None, hf_params: Optional[dict] = None, model_path: Optional[Path] = None, total_params: int = 0) -> Metadata:
# Reference Model Card Metadata: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
# Model Card Heuristics
########################
if model_card is not None:
if "model_name" in model_card and metadata.name is None:
# Not part of huggingface model card standard but notice some model creator using it
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
metadata.name = model_card.get("model_name")
if "model_creator" in model_card and metadata.author is None:
# Not part of huggingface model card standard but notice some model creator using it
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
metadata.author = model_card.get("model_creator")
if "model_type" in model_card and metadata.basename is None:
# Not part of huggingface model card standard but notice some model creator using it
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
metadata.basename = model_card.get("model_type")
if "base_model" in model_card:
# This represents the parent models that this is based on
# Example: stabilityai/stable-diffusion-xl-base-1.0. Can also be a list (for merges)
# Example of merges: https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0.1/blob/main/README.md
metadata_base_models = []
base_model_value = model_card.get("base_model", None)
if base_model_value is not None:
if isinstance(base_model_value, str):
metadata_base_models.append(base_model_value)
elif isinstance(base_model_value, list):
metadata_base_models.extend(base_model_value)
if metadata.base_models is None:
metadata.base_models = []
for model_id in metadata_base_models:
# NOTE: model size of base model is assumed to be similar to the size of the current model
model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params)
base_model = {}
if model_full_name_component is not None:
base_model["name"] = Metadata.id_to_title(model_full_name_component)
if org_component is not None:
base_model["organization"] = Metadata.id_to_title(org_component)
if version is not None:
base_model["version"] = version
if org_component is not None and model_full_name_component is not None:
base_model["repo_url"] = f"https://huggingface.co/{org_component}/{model_full_name_component}"
metadata.base_models.append(base_model)
if "license" in model_card and metadata.license is None:
metadata.license = model_card.get("license")
if "license_name" in model_card and metadata.license_name is None:
metadata.license_name = model_card.get("license_name")
if "license_link" in model_card and metadata.license_link is None:
metadata.license_link = model_card.get("license_link")
tags_value = model_card.get("tags", None)
if tags_value is not None:
if metadata.tags is None:
metadata.tags = []
if isinstance(tags_value, str):
metadata.tags.append(tags_value)
elif isinstance(tags_value, list):
metadata.tags.extend(tags_value)
pipeline_tags_value = model_card.get("pipeline_tag", None)
if pipeline_tags_value is not None:
if metadata.tags is None:
metadata.tags = []
if isinstance(pipeline_tags_value, str):
metadata.tags.append(pipeline_tags_value)
elif isinstance(pipeline_tags_value, list):
metadata.tags.extend(pipeline_tags_value)
language_value = model_card.get("languages", model_card.get("language", None))
if language_value is not None:
if metadata.languages is None:
metadata.languages = []
if isinstance(language_value, str):
metadata.languages.append(language_value)
elif isinstance(language_value, list):
metadata.languages.extend(language_value)
dataset_value = model_card.get("datasets", model_card.get("dataset", None))
if dataset_value is not None:
if metadata.datasets is None:
metadata.datasets = []
if isinstance(dataset_value, str):
metadata.datasets.append(dataset_value)
elif isinstance(dataset_value, list):
metadata.datasets.extend(dataset_value)
# Hugging Face Parameter Heuristics
####################################
if hf_params is not None:
hf_name_or_path = hf_params.get("_name_or_path")
if hf_name_or_path is not None and hf_name_or_path.count('/') <= 1:
# Use _name_or_path only if its actually a model name and not some computer path
# e.g. 'meta-llama/Llama-2-7b-hf'
model_id = hf_name_or_path
model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params)
if metadata.name is None and model_full_name_component is not None:
metadata.name = Metadata.id_to_title(model_full_name_component)
if metadata.organization is None and org_component is not None:
metadata.organization = Metadata.id_to_title(org_component)
if metadata.basename is None and basename is not None:
metadata.basename = basename
if metadata.finetune is None and finetune is not None:
metadata.finetune = finetune
if metadata.version is None and version is not None:
metadata.version = version
if metadata.size_label is None and size_label is not None:
metadata.size_label = size_label
# Directory Folder Name Fallback Heuristics
############################################
if model_path is not None:
model_id = model_path.name
model_full_name_component, org_component, basename, finetune, version, size_label = Metadata.get_model_id_components(model_id, total_params)
if metadata.name is None and model_full_name_component is not None:
metadata.name = Metadata.id_to_title(model_full_name_component)
if metadata.organization is None and org_component is not None:
metadata.organization = Metadata.id_to_title(org_component)
if metadata.basename is None and basename is not None:
metadata.basename = basename
if metadata.finetune is None and finetune is not None:
metadata.finetune = finetune
if metadata.version is None and version is not None:
metadata.version = version
if metadata.size_label is None and size_label is not None:
metadata.size_label = size_label
return metadata
def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter):
assert self.name is not None
gguf_writer.add_name(self.name)
if self.author is not None:
gguf_writer.add_author(self.author)
if self.version is not None:
gguf_writer.add_version(self.version)
if self.organization is not None:
gguf_writer.add_organization(self.organization)
if self.finetune is not None:
gguf_writer.add_finetune(self.finetune)
if self.basename is not None:
gguf_writer.add_basename(self.basename)
if self.description is not None:
gguf_writer.add_description(self.description)
if self.quantized_by is not None:
gguf_writer.add_quantized_by(self.quantized_by)
if self.size_label is not None:
gguf_writer.add_size_label(self.size_label)
if self.license is not None:
gguf_writer.add_license(self.license)
if self.license_name is not None:
gguf_writer.add_license_name(self.license_name)
if self.license_link is not None:
gguf_writer.add_license_link(self.license_link)
if self.url is not None:
gguf_writer.add_url(self.url)
if self.doi is not None:
gguf_writer.add_doi(self.doi)
if self.uuid is not None:
gguf_writer.add_uuid(self.uuid)
if self.repo_url is not None:
gguf_writer.add_repo_url(self.repo_url)
if self.source_url is not None:
gguf_writer.add_source_url(self.source_url)
if self.source_doi is not None:
gguf_writer.add_source_doi(self.source_doi)
if self.source_uuid is not None:
gguf_writer.add_source_uuid(self.source_uuid)
if self.source_repo_url is not None:
gguf_writer.add_source_repo_url(self.source_repo_url)
if self.base_models is not None:
gguf_writer.add_base_model_count(len(self.base_models))
for key, base_model_entry in enumerate(self.base_models):
if "name" in base_model_entry:
gguf_writer.add_base_model_name(key, base_model_entry["name"])
if "author" in base_model_entry:
gguf_writer.add_base_model_author(key, base_model_entry["author"])
if "version" in base_model_entry:
gguf_writer.add_base_model_version(key, base_model_entry["version"])
if "organization" in base_model_entry:
gguf_writer.add_base_model_organization(key, base_model_entry["organization"])
if "url" in base_model_entry:
gguf_writer.add_base_model_url(key, base_model_entry["url"])
if "doi" in base_model_entry:
gguf_writer.add_base_model_doi(key, base_model_entry["doi"])
if "uuid" in base_model_entry:
gguf_writer.add_base_model_uuid(key, base_model_entry["uuid"])
if "repo_url" in base_model_entry:
gguf_writer.add_base_model_repo_url(key, base_model_entry["repo_url"])
if self.tags is not None:
gguf_writer.add_tags(self.tags)
if self.languages is not None:
gguf_writer.add_languages(self.languages)
if self.datasets is not None:
gguf_writer.add_datasets(self.datasets)


@ -1,121 +0,0 @@
from __future__ import annotations
from typing import Callable, Sequence
from numpy.typing import DTypeLike
from .constants import GGML_QUANT_SIZES, GGMLQuantizationType
from .lazy import LazyNumpyTensor
import numpy as np
def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
block_size, type_size = GGML_QUANT_SIZES[quant_type]
if shape[-1] % block_size != 0:
raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
return (*shape[:-1], shape[-1] // block_size * type_size)
def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
block_size, type_size = GGML_QUANT_SIZES[quant_type]
if shape[-1] % type_size != 0:
raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
return (*shape[:-1], shape[-1] // type_size * block_size)
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
n = n.astype(np.float32, copy=False).view(np.uint32)
# force nan to quiet
n = np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n)
# round to nearest even
n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16
return n.astype(np.uint16)
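# Illustrative: 1.0 is 0x3F800000 in fp32; rounding to nearest even keeps the top 16 bits,
# so the bf16 bit pattern returned for np.array([1.0], dtype=np.float32) is 0x3F80.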
# This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
rows = arr.reshape((-1, arr.shape[-1]))
osize = 1
for dim in oshape:
osize *= dim
out = np.empty(shape=osize, dtype=otype)
# compute over groups of 16 rows (arbitrary, but seems good for performance)
n_groups = (rows.shape[0] // 16) or 1
np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out)
return out.reshape(oshape)
def __quantize_bf16_array(n: np.ndarray) -> np.ndarray:
return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.uint16, oshape=n.shape)
__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.uint16)
def quantize_bf16(n: np.ndarray):
if type(n) is LazyNumpyTensor:
return __quantize_bf16_lazy(n)
else:
return __quantize_bf16_array(n)
__q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0]
def can_quantize_to_q8_0(n: np.ndarray) -> bool:
return n.shape[-1] % __q8_block_size == 0
# round away from zero
# ref: https://stackoverflow.com/a/59143326/22827863
def np_roundf(n: np.ndarray) -> np.ndarray:
a = abs(n)
floored = np.floor(a)
b = floored + np.floor(2 * (a - floored))
return np.sign(n) * b
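# Illustrative: np_roundf(np.array([0.5, -0.5, 2.5])) -> [1., -1., 3.], whereas np.round
# would give [0., -0., 2.] because it rounds halves to even.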
def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]:
return (*s[:-1], s[-1] // __q8_block_size * __q8_type_size)
# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray:
shape = n.shape
assert shape[-1] % __q8_block_size == 0
n_blocks = n.size // __q8_block_size
blocks = n.reshape((n_blocks, __q8_block_size)).astype(np.float32, copy=False)
d = abs(blocks).max(axis=1, keepdims=True) / 127
with np.errstate(divide="ignore"):
id = np.where(d == 0, 0, 1 / d)
qs = np_roundf(blocks * id)
# (n_blocks, 2)
d = d.astype(np.float16).view(np.uint8)
# (n_blocks, block_size)
qs = qs.astype(np.int8).view(np.uint8)
assert d.shape[1] + qs.shape[1] == __q8_type_size
return np.concatenate([d, qs], axis=1).reshape(__quantize_q8_0_shape_change(shape))
def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray:
return __apply_over_grouped_rows(__quantize_q8_0_rows, arr=n, otype=np.uint8, oshape=__quantize_q8_0_shape_change(n.shape))
__quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn(
__quantize_q8_0_array,
meta_noop=(np.uint8, __quantize_q8_0_shape_change),
)
def quantize_q8_0(data: np.ndarray):
if type(data) is LazyNumpyTensor:
return __quantize_q8_0_lazy(data)
else:
return __quantize_q8_0_array(data)
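A short usage sketch for the quantizers above (the array is synthetic; row length must be a multiple of the 32-wide Q8_0 block):
import numpy as np

data = np.random.rand(4, 64).astype(np.float32)   # synthetic fp32 tensor, rows divisible by 32
assert can_quantize_to_q8_0(data)
packed = quantize_q8_0(data)   # uint8 output, 34 bytes per block (fp16 scale + 32 int8 quants)
bf16 = quantize_bf16(data)     # uint16 output with the same shape as the input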


@ -1,649 +0,0 @@
from __future__ import annotations
from typing import Sequence
from .constants import MODEL_ARCH, MODEL_TENSOR, MODEL_TENSORS, TENSOR_NAMES
class TensorNameMap:
mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
# Token embeddings
MODEL_TENSOR.TOKEN_EMBD: (
"gpt_neox.embed_in", # gptneox
"transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais
"transformer.word_embeddings", # falcon
"word_embeddings", # bloom
"model.embed_tokens", # llama-hf
"tok_embeddings", # llama-pth
"embeddings.word_embeddings", # bert nomic-bert
"language_model.embedding.word_embeddings", # persimmon
"wte", # gpt2
"transformer.embd.wte", # phi2
"model.tok_embeddings", # internlm2
"model.embedding", # mamba-qbert
"backbone.embedding", # mamba
"backbone.embeddings", # mamba-hf
"transformer.in_out_embed", # Grok
"embedding.word_embeddings", # chatglm
"transformer.token_embeddings", # openelm
"shared", # t5
),
# Token type embeddings
MODEL_TENSOR.TOKEN_TYPES: (
"embeddings.token_type_embeddings", # bert nomic-bert
),
# Normalization of token embeddings
MODEL_TENSOR.TOKEN_EMBD_NORM: (
"word_embeddings_layernorm", # bloom
"embeddings.LayerNorm", # bert
"emb_ln", # nomic-bert
"transformer.norm", # openelm
),
# Position embeddings
MODEL_TENSOR.POS_EMBD: (
"transformer.wpe", # gpt2
"embeddings.position_embeddings", # bert
"wpe", # gpt2
),
# Output
MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais
"output", # llama-pth bloom internlm2
"word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2
"output_layer", # chatglm
),
# Output norm
MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox
"transformer.ln_f", # gpt2 gpt-j falcon jais
"model.norm", # llama-hf baichuan internlm2
"norm", # llama-pth
"transformer.norm_f", # mpt dbrx
"ln_f", # refact bloom qwen gpt2
"language_model.encoder.final_layernorm", # persimmon
"model.final_layernorm", # persimmon
"lm_head.ln", # phi2
"model.norm_f", # mamba-qbert
"backbone.norm_f", # mamba
"transformer.rms_norm", # Grok
"encoder.final_layernorm", # chatglm
"transformer.norm", # openelm
),
# Rope frequencies
MODEL_TENSOR.ROPE_FREQS: (
"rope.freqs", # llama-pth
"rotary_pos_emb.inv_freq", # chatglm
),
}
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
# Attention norm
MODEL_TENSOR.ATTN_NORM: (
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen jais
"transformer.blocks.{bid}.norm_1", # mpt
"transformer.h.{bid}.input_layernorm", # falcon7b
"h.{bid}.input_layernorm", # bloom
"transformer.h.{bid}.ln_mlp", # falcon40b
"model.layers.{bid}.input_layernorm", # llama-hf
"layers.{bid}.attention_norm", # llama-pth
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon
"model.layers.{bid}.ln1", # yi
"h.{bid}.ln_1", # gpt2
"transformer.h.{bid}.ln", # phi2
"model.layers.layers.{bid}.norm", # plamo
"model.layers.{bid}.attention_norm", # internlm2
"model.layers.{bid}.norm", # mamba-qbert
"backbone.layers.{bid}.norm", # mamba
"transformer.decoder_layer.{bid}.rms_norm", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
"encoder.layers.{bid}.input_layernorm", # chatglm
"transformer.layers.{bid}.attn_norm", # openelm
),
# Attention norm 2
MODEL_TENSOR.ATTN_NORM_2: (
"transformer.h.{bid}.ln_attn", # falcon40b
"encoder.layer.{bid}.layer_norm_1", # jina-v2-code
),
# Attention query-key-value
MODEL_TENSOR.ATTN_QKV: (
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
"transformer.h.{bid}.attn.c_attn", # gpt2 qwen jais
"transformer.blocks.{bid}.attn.Wqkv", # mpt
"transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv", # dbrx
"transformer.h.{bid}.self_attention.query_key_value", # falcon
"h.{bid}.self_attention.query_key_value", # bloom
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
"model.layers.{bid}.self_attn.query_key_value", # persimmon
"h.{bid}.attn.c_attn", # gpt2
"transformer.h.{bid}.mixer.Wqkv", # phi2
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert
"model.layers.{bid}.self_attn.qkv_proj", # phi3
"encoder.layers.{bid}.self_attention.query_key_value", # chatglm
"transformer.layers.{bid}.attn.qkv_proj", # openelm
),
# Attention query
MODEL_TENSOR.ATTN_Q: (
"model.layers.{bid}.self_attn.q_proj", # llama-hf
"layers.{bid}.attention.wq", # llama-pth
"encoder.layer.{bid}.attention.self.query", # bert
"transformer.h.{bid}.attn.q_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.q_proj", # plamo
"model.layers.{bid}.attention.wq", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
),
# Attention key
MODEL_TENSOR.ATTN_K: (
"model.layers.{bid}.self_attn.k_proj", # llama-hf
"layers.{bid}.attention.wk", # llama-pth
"encoder.layer.{bid}.attention.self.key", # bert
"transformer.h.{bid}.attn.k_proj", # gpt-j
"transformer.h.{bid}.attn.k", # refact
"model.layers.layers.{bid}.self_attn.k_proj", # plamo
"model.layers.{bid}.attention.wk", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
),
# Attention value
MODEL_TENSOR.ATTN_V: (
"model.layers.{bid}.self_attn.v_proj", # llama-hf
"layers.{bid}.attention.wv", # llama-pth
"encoder.layer.{bid}.attention.self.value", # bert
"transformer.h.{bid}.attn.v_proj", # gpt-j
"transformer.h.{bid}.attn.v", # refact
"model.layers.layers.{bid}.self_attn.v_proj", # plamo
"model.layers.{bid}.attention.wv", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok
),
# Attention output
MODEL_TENSOR.ATTN_OUT: (
"gpt_neox.layers.{bid}.attention.dense", # gptneox
"transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen jais
"transformer.blocks.{bid}.attn.out_proj", # mpt
"transformer.h.{bid}.self_attention.dense", # falcon
"h.{bid}.self_attention.dense", # bloom
"model.layers.{bid}.self_attn.o_proj", # llama-hf
"layers.{bid}.attention.wo", # llama-pth
"encoder.layer.{bid}.attention.output.dense", # bert
"transformer.h.{bid}.attn.out_proj", # gpt-j
"language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
"model.layers.{bid}.self_attn.dense", # persimmon
"h.{bid}.attn.c_proj", # gpt2
"transformer.h.{bid}.mixer.out_proj", # phi2
"model.layers.layers.{bid}.self_attn.o_proj", # plamo
"model.layers.{bid}.attention.wo", # internlm2
"encoder.layers.{bid}.attn.out_proj", # nomic-bert
"transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
"transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
"encoder.layers.{bid}.self_attention.dense", # chatglm
"transformer.layers.{bid}.attn.out_proj", # openelm
),
# Attention output norm
MODEL_TENSOR.ATTN_OUT_NORM: (
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
"encoder.layers.{bid}.norm1", # nomic-bert
"transformer.decoder_layer.{bid}.rms_norm_1", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
),
MODEL_TENSOR.ATTN_POST_NORM: (
"model.layers.{bid}.post_attention_layernorm", # gemma2
),
# Rotary embeddings
MODEL_TENSOR.ATTN_ROT_EMBD: (
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
"layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth
"model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo
"transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell
),
# Feed-forward norm
MODEL_TENSOR.FFN_NORM: (
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
"transformer.h.{bid}.ln_2", # gpt2 refact qwen jais
"h.{bid}.post_attention_layernorm", # bloom
"transformer.blocks.{bid}.norm_2", # mpt
"model.layers.{bid}.post_attention_layernorm", # llama-hf
"layers.{bid}.ffn_norm", # llama-pth
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
"model.layers.{bid}.ln2", # yi
"h.{bid}.ln_2", # gpt2
"model.layers.{bid}.ffn_norm", # internlm2
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok
"encoder.layers.{bid}.post_attention_layernorm", # chatglm
"transformer.layers.{bid}.ffn_norm", # openelm
),
# Post feed-forward norm
MODEL_TENSOR.FFN_PRE_NORM: (
"model.layers.{bid}.pre_feedforward_layernorm", # gemma2
),
# Post feed-forward norm
MODEL_TENSOR.FFN_POST_NORM: (
"model.layers.{bid}.post_feedforward_layernorm", # gemma2
),
MODEL_TENSOR.FFN_GATE_INP: (
"layers.{bid}.feed_forward.gate", # mixtral
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
"model.layers.{bid}.mlp.gate", # qwen2moe
"transformer.decoder_layer.{bid}.router", # Grok
"transformer.blocks.{bid}.ffn.router.layer", # dbrx
),
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
"model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe
),
# Feed-forward up
MODEL_TENSOR.FFN_UP: (
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
"transformer.h.{bid}.mlp.c_fc", # gpt2 jais
"transformer.blocks.{bid}.ffn.up_proj", # mpt
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
"h.{bid}.mlp.dense_h_to_4h", # bloom
"model.layers.{bid}.mlp.up_proj", # llama-hf refact
"layers.{bid}.feed_forward.w3", # llama-pth
"encoder.layer.{bid}.intermediate.dense", # bert
"transformer.h.{bid}.mlp.fc_in", # gpt-j
"transformer.h.{bid}.mlp.linear_3", # refact
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
"model.layers.{bid}.mlp.dense_h_to_4h", # persimmon
"transformer.h.{bid}.mlp.w1", # qwen
"h.{bid}.mlp.c_fc", # gpt2
"transformer.h.{bid}.mlp.fc1", # phi2
"model.layers.{bid}.mlp.fc1", # phi2
"model.layers.{bid}.mlp.gate_up_proj", # phi3
"model.layers.layers.{bid}.mlp.up_proj", # plamo
"model.layers.{bid}.feed_forward.w3", # internlm2
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
"model.layers.{bid}.mlp.c_fc", # starcoder2
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
"model.layers.{bid}.residual_mlp.w3", # arctic
"encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
),
MODEL_TENSOR.FFN_UP_EXP: (
"layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
"model.layers.{bid}.mlp.experts.up_proj", # qwen2moe (merged)
),
MODEL_TENSOR.FFN_UP_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
"model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek2
),
# AWQ-activation gate
MODEL_TENSOR.FFN_ACT: (
"transformer.blocks.{bid}.ffn.act", # mpt
),
# Feed-forward gate
MODEL_TENSOR.FFN_GATE: (
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact
"layers.{bid}.feed_forward.w1", # llama-pth
"transformer.h.{bid}.mlp.w2", # qwen
"transformer.h.{bid}.mlp.c_fc2", # jais
"model.layers.layers.{bid}.mlp.gate_proj", # plamo
"model.layers.{bid}.feed_forward.w1", # internlm2
"encoder.layers.{bid}.mlp.fc12", # nomic-bert
"encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2
"transformer.h.{bid}.mlp.linear_1", # refact
"model.layers.{bid}.residual_mlp.w1", # arctic
),
MODEL_TENSOR.FFN_GATE_EXP: (
"layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
"model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe (merged)
),
MODEL_TENSOR.FFN_GATE_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
"model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek2
),
# Feed-forward down
MODEL_TENSOR.FFN_DOWN: (
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
"transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen jais
"transformer.blocks.{bid}.ffn.down_proj", # mpt
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
"h.{bid}.mlp.dense_4h_to_h", # bloom
"model.layers.{bid}.mlp.down_proj", # llama-hf
"layers.{bid}.feed_forward.w2", # llama-pth
"encoder.layer.{bid}.output.dense", # bert
"transformer.h.{bid}.mlp.fc_out", # gpt-j
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
"model.layers.{bid}.mlp.dense_4h_to_h", # persimmon
"h.{bid}.mlp.c_proj", # gpt2
"transformer.h.{bid}.mlp.fc2", # phi2
"model.layers.{bid}.mlp.fc2", # phi2
"model.layers.layers.{bid}.mlp.down_proj", # plamo
"model.layers.{bid}.feed_forward.w2", # internlm2
"encoder.layers.{bid}.mlp.fc2", # nomic-bert
"model.layers.{bid}.mlp.c_proj", # starcoder2
"encoder.layer.{bid}.mlp.wo", # jina-bert-v2
"transformer.layers.{bid}.ffn.proj_2", # openelm
"model.layers.{bid}.residual_mlp.w2", # arctic
"encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2
"encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
),
MODEL_TENSOR.FFN_DOWN_EXP: (
"layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
"model.layers.{bid}.mlp.experts.down_proj", # qwen2moe (merged)
),
MODEL_TENSOR.FFN_DOWN_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
"model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek2
),
MODEL_TENSOR.ATTN_Q_NORM: (
"language_model.encoder.layers.{bid}.self_attention.q_layernorm",
"model.layers.{bid}.self_attn.q_layernorm", # persimmon
"model.layers.{bid}.self_attn.q_norm", # cohere
"transformer.blocks.{bid}.attn.q_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
"transformer.layers.{bid}.attn.q_norm", # openelm
),
MODEL_TENSOR.ATTN_K_NORM: (
"language_model.encoder.layers.{bid}.self_attention.k_layernorm",
"model.layers.{bid}.self_attn.k_layernorm", # persimmon
"model.layers.{bid}.self_attn.k_norm", # cohere
"transformer.blocks.{bid}.attn.k_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
"transformer.layers.{bid}.attn.k_norm", # openelm
),
MODEL_TENSOR.ROPE_FREQS: (
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
),
MODEL_TENSOR.LAYER_OUT_NORM: (
"encoder.layer.{bid}.output.LayerNorm", # bert
"encoder.layers.{bid}.norm2", # nomic-bert
"transformer.decoder_layer.{bid}.rms_norm_3", # Grok
"encoder.layer.{bid}.mlp.layernorm", # jina-bert-v2
"encoder.layer.{bid}.layer_norm_2" # jina-v2-code
),
MODEL_TENSOR.SSM_IN: (
"model.layers.{bid}.in_proj",
"backbone.layers.{bid}.mixer.in_proj",
),
MODEL_TENSOR.SSM_CONV1D: (
"model.layers.{bid}.conv1d",
"backbone.layers.{bid}.mixer.conv1d",
),
MODEL_TENSOR.SSM_X: (
"model.layers.{bid}.x_proj",
"backbone.layers.{bid}.mixer.x_proj",
),
MODEL_TENSOR.SSM_DT: (
"model.layers.{bid}.dt_proj",
"backbone.layers.{bid}.mixer.dt_proj",
),
MODEL_TENSOR.SSM_A: (
"model.layers.{bid}.A_log",
"backbone.layers.{bid}.mixer.A_log",
),
MODEL_TENSOR.SSM_D: (
"model.layers.{bid}.D",
"backbone.layers.{bid}.mixer.D",
),
MODEL_TENSOR.SSM_OUT: (
"model.layers.{bid}.out_proj",
"backbone.layers.{bid}.mixer.out_proj",
),
MODEL_TENSOR.ATTN_Q_A: (
"model.layers.{bid}.self_attn.q_a_proj", # deepseek2
),
MODEL_TENSOR.ATTN_Q_B: (
"model.layers.{bid}.self_attn.q_b_proj", # deepseek2
),
MODEL_TENSOR.ATTN_KV_A_MQA: (
"model.layers.{bid}.self_attn.kv_a_proj_with_mqa", # deepseek2
),
MODEL_TENSOR.ATTN_KV_B: (
"model.layers.{bid}.self_attn.kv_b_proj", # deepseek2
),
MODEL_TENSOR.ATTN_Q_A_NORM: (
"model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2
),
MODEL_TENSOR.ATTN_KV_A_NORM: (
"model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2
),
MODEL_TENSOR.ATTN_SUB_NORM: (
"model.layers.{bid}.self_attn.inner_attn_ln", # bitnet
),
MODEL_TENSOR.FFN_SUB_NORM: (
"model.layers.{bid}.mlp.ffn_layernorm", # bitnet
),
MODEL_TENSOR.DEC_ATTN_NORM: (
"decoder.block.{bid}.layer.0.layer_norm", # t5
),
MODEL_TENSOR.DEC_ATTN_Q: (
"decoder.block.{bid}.layer.0.SelfAttention.q", # t5
),
MODEL_TENSOR.DEC_ATTN_K: (
"decoder.block.{bid}.layer.0.SelfAttention.k", # t5
),
MODEL_TENSOR.DEC_ATTN_V: (
"decoder.block.{bid}.layer.0.SelfAttention.v", # t5
),
MODEL_TENSOR.DEC_ATTN_OUT: (
"decoder.block.{bid}.layer.0.SelfAttention.o", # t5
),
MODEL_TENSOR.DEC_ATTN_REL_B: (
"decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_NORM: (
"decoder.block.{bid}.layer.1.layer_norm", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_Q: (
"decoder.block.{bid}.layer.1.EncDecAttention.q", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_K: (
"decoder.block.{bid}.layer.1.EncDecAttention.k", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_V: (
"decoder.block.{bid}.layer.1.EncDecAttention.v", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_OUT: (
"decoder.block.{bid}.layer.1.EncDecAttention.o", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: (
"decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", # t5
),
MODEL_TENSOR.DEC_FFN_NORM: (
"decoder.block.{bid}.layer.2.layer_norm", # t5
),
MODEL_TENSOR.DEC_FFN_GATE: (
"decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5
),
MODEL_TENSOR.DEC_FFN_UP: (
"decoder.block.{bid}.layer.2.DenseReluDense.wi", # t5
"decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5
),
MODEL_TENSOR.DEC_FFN_DOWN: (
"decoder.block.{bid}.layer.2.DenseReluDense.wo", # t5
),
MODEL_TENSOR.DEC_OUTPUT_NORM: (
"decoder.final_layer_norm", # t5
),
MODEL_TENSOR.ENC_ATTN_NORM: (
"encoder.block.{bid}.layer.0.layer_norm", # t5
),
MODEL_TENSOR.ENC_ATTN_Q: (
"encoder.block.{bid}.layer.0.SelfAttention.q", # t5
),
MODEL_TENSOR.ENC_ATTN_K: (
"encoder.block.{bid}.layer.0.SelfAttention.k", # t5
),
MODEL_TENSOR.ENC_ATTN_V: (
"encoder.block.{bid}.layer.0.SelfAttention.v", # t5
),
MODEL_TENSOR.ENC_ATTN_OUT: (
"encoder.block.{bid}.layer.0.SelfAttention.o", # t5
),
MODEL_TENSOR.ENC_ATTN_REL_B: (
"encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
),
MODEL_TENSOR.ENC_FFN_NORM: (
"encoder.block.{bid}.layer.1.layer_norm", # t5
),
MODEL_TENSOR.ENC_FFN_GATE: (
"encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5
),
MODEL_TENSOR.ENC_FFN_UP: (
"encoder.block.{bid}.layer.1.DenseReluDense.wi", # t5
"encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5
),
MODEL_TENSOR.ENC_FFN_DOWN: (
"encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5
),
MODEL_TENSOR.ENC_OUTPUT_NORM: (
"encoder.final_layer_norm", # t5
),
}
# architecture-specific block mappings
arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = {
MODEL_ARCH.ARCTIC: {
MODEL_TENSOR.FFN_NORM: (
"model.layers.{bid}.residual_layernorm",
),
MODEL_TENSOR.FFN_NORM_EXP: (
"model.layers.{bid}.post_attention_layernorm",
),
},
}
mapping: dict[str, tuple[MODEL_TENSOR, str]]
def __init__(self, arch: MODEL_ARCH, n_blocks: int):
self.mapping = {}
for tensor, keys in self.mappings_cfg.items():
if tensor not in MODEL_TENSORS[arch]:
continue
tensor_name = TENSOR_NAMES[tensor]
self.mapping[tensor_name] = (tensor, tensor_name)
for key in keys:
self.mapping[key] = (tensor, tensor_name)
if arch in self.arch_block_mappings_cfg:
self.block_mappings_cfg.update(self.arch_block_mappings_cfg[arch])
for bid in range(n_blocks):
for tensor, keys in self.block_mappings_cfg.items():
if tensor not in MODEL_TENSORS[arch]:
continue
tensor_name = TENSOR_NAMES[tensor].format(bid = bid)
self.mapping[tensor_name] = (tensor, tensor_name)
for key in keys:
key = key.format(bid = bid)
self.mapping[key] = (tensor, tensor_name)
def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
result = self.mapping.get(key)
if result is not None:
return result
for suffix in try_suffixes:
if key.endswith(suffix):
result = self.mapping.get(key[:-len(suffix)])
if result is not None:
return result[0], result[1] + suffix
return None
def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None:
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
if result is None:
return None
return result[1]
def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None:
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
if result is None:
return None
return result[0]
def __getitem__(self, key: str) -> str:
try:
return self.mapping[key][1]
except KeyError:
raise KeyError(key)
def __contains__(self, key: str) -> bool:
return key in self.mapping
def __repr__(self) -> str:
return repr(self.mapping)
def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap:
return TensorNameMap(arch, n_blocks)
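# A minimal usage sketch (values are hypothetical, not from the committed code). It assumes the
# LLaMA attention mappings defined earlier in this table, e.g. "model.layers.{bid}.self_attn.q_proj":
# name_map = get_tensor_name_map(MODEL_ARCH.LLAMA, n_blocks=32)
# name_map.get_name("model.layers.0.self_attn.q_proj.weight", try_suffixes=(".weight", ".bias"))
# -> "blk.0.attn_q.weight" (the GGUF-side name plus the matched suffix)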


@ -1,69 +0,0 @@
from __future__ import annotations
from typing import Literal
def fill_templated_filename(filename: str, output_type: str | None) -> str:
# Given a file name, fill in any type templates, e.g. 'some-model-name.{ftype}.gguf'
ftype_lowercase: str = output_type.lower() if output_type is not None else ""
ftype_uppercase: str = output_type.upper() if output_type is not None else ""
return filename.format(ftype_lowercase,
outtype=ftype_lowercase, ftype=ftype_lowercase,
OUTTYPE=ftype_uppercase, FTYPE=ftype_uppercase)
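# Illustrative behavior of the helper above (filenames and output type are hypothetical):
# fill_templated_filename("mymodel-{ftype}.gguf", "Q8_0")  -> "mymodel-q8_0.gguf"
# fill_templated_filename("mymodel-{FTYPE}.gguf", "Q8_0")  -> "mymodel-Q8_0.gguf"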
def model_weight_count_rounded_notation(model_params_count: int, min_digits: int = 2) -> str:
if model_params_count > 1e12 :
# Trillions Of Parameters
scaled_model_params = model_params_count * 1e-12
scale_suffix = "T"
elif model_params_count > 1e9 :
# Billions Of Parameters
scaled_model_params = model_params_count * 1e-9
scale_suffix = "B"
elif model_params_count > 1e6 :
# Millions Of Parameters
scaled_model_params = model_params_count * 1e-6
scale_suffix = "M"
else:
# Thousands Of Parameters
scaled_model_params = model_params_count * 1e-3
scale_suffix = "K"
fix = max(min_digits - len(str(round(scaled_model_params)).lstrip('0')), 0)
return f"{scaled_model_params:.{fix}f}{scale_suffix}"
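# Illustrative rounding behavior (parameter counts are made up for the example):
# model_weight_count_rounded_notation(7_241_000_000)  -> "7.2B"
# model_weight_count_rounded_notation(355_000_000)    -> "355M"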
def size_label(total_params: int, shared_params: int, expert_params: int, expert_count: int) -> str:
if expert_count > 0:
pretty_size = model_weight_count_rounded_notation(abs(shared_params) + abs(expert_params), min_digits=2)
size_class = f"{expert_count}x{pretty_size}"
else:
size_class = model_weight_count_rounded_notation(abs(total_params), min_digits=2)
return size_class
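# Illustrative behavior (parameter counts are made up): a dense model vs. an 8-expert MoE:
# size_label(6_738_000_000, 0, 0, 0)                                    -> "6.7B"
# size_label(0, shared_params=1_000_000_000,
#            expert_params=6_500_000_000, expert_count=8)               -> "8x7.5B"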
def naming_convention(model_name: str | None, base_name: str | None, finetune_string: str | None, version_string: str | None, size_label: str | None, output_type: str | None, model_type: Literal['vocab', 'LoRA'] | None = None) -> str:
# Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention
if base_name is not None:
name = base_name.strip().replace(' ', '-').replace('/', '-')
elif model_name is not None:
name = model_name.strip().replace(' ', '-').replace('/', '-')
else:
name = "ggml-model"
parameters = f"-{size_label}" if size_label is not None else ""
finetune = f"-{finetune_string.strip().replace(' ', '-')}" if finetune_string is not None else ""
version = f"-{version_string.strip().replace(' ', '-')}" if version_string is not None else ""
encoding = f"-{output_type.strip().replace(' ', '-').upper()}" if output_type is not None else ""
kind = f"-{model_type.strip().replace(' ', '-')}" if model_type is not None else ""
return f"{name}{parameters}{finetune}{version}{encoding}{kind}"
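# Illustrative result (model metadata is hypothetical), following the referenced GGUF naming convention:
# naming_convention("Mixtral", None, "Instruct", "v0.1", "8x7B", "Q4_0")
# -> "Mixtral-8x7B-Instruct-v0.1-Q4_0"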

src/gguf/constants.py (2306 lines, normal file)
File diff suppressed because it is too large


@ -1,7 +1,3 @@
# This file is left for compatibility. If you want to use the GGUF API from Python
# then don't import gguf/gguf.py directly. If you're looking for examples, see the
# examples/ directory for gguf-py
import importlib
import sys
from pathlib import Path


@ -1,11 +1,8 @@
#
# GGUF file reading/modification support. For API usage information,
# please see the scripts/ directory for some fairly simple examples.
#
from __future__ import annotations
import logging
import os
import sys
from collections import OrderedDict
from typing import Any, Literal, NamedTuple, TypeVar, Union
@ -15,7 +12,6 @@
from .quants import quant_shape_to_byte_shape
if __name__ == "__main__":
import sys
from pathlib import Path
# Allow running file in package as a script.
@ -28,6 +24,7 @@
GGUF_VERSION,
GGMLQuantizationType,
GGUFValueType,
GGUFEndian,
)
logger = logging.getLogger(__name__)
@ -53,6 +50,52 @@ class ReaderField(NamedTuple):
types: list[GGUFValueType] = []
def contents(self, index_or_slice: int | slice = slice(None)) -> Any:
if self.types:
to_string = lambda x: str(x.tobytes(), encoding="utf-8") # noqa: E731
main_type = self.types[0]
if main_type == GGUFValueType.ARRAY:
sub_type = self.types[-1]
if sub_type == GGUFValueType.STRING:
indices = self.data[index_or_slice]
if isinstance(index_or_slice, int):
return to_string(self.parts[indices]) # type: ignore
else:
return [to_string(self.parts[idx]) for idx in indices] # type: ignore
else:
# FIXME: When/if _get_field_parts() support multi-dimensional arrays, this must do so too
# Check if it's unsafe to perform slice optimization on data
# if any(True for idx in self.data if len(self.parts[idx]) != 1):
# optim_slice = slice(None)
# else:
# optim_slice = index_or_slice
# index_or_slice = slice(None)
# if isinstance(optim_slice, int):
# return self.parts[self.data[optim_slice]].tolist()[0]
# else:
# return [pv for idx in self.data[optim_slice] for pv in self.parts[idx].tolist()][index_or_slice]
if isinstance(index_or_slice, int):
return self.parts[self.data[index_or_slice]].tolist()[0]
else:
return [
pv
for idx in self.data[index_or_slice]
for pv in self.parts[idx].tolist()
]
if main_type == GGUFValueType.STRING:
return to_string(self.parts[-1])
else:
return self.parts[-1].tolist()[0]
return None
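# Illustrative semantics of contents() (a sketch; the keys shown are typical GGUF metadata keys,
# not taken from this diff):
# reader.get_field("general.architecture").contents()           -> e.g. "llama"
# reader.get_field("tokenizer.ggml.tokens").contents(0)         -> the first token string
# reader.get_field("tokenizer.ggml.tokens").contents(slice(3))  -> the first three token strings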
class ReaderTensor(NamedTuple):
name: str
@ -67,32 +110,34 @@ class ReaderTensor(NamedTuple):
class GGUFReader:
# I - same as host, S - swapped
byte_order: Literal['I', 'S'] = 'I'
byte_order: Literal["I", "S"] = "I"
alignment: int = GGUF_DEFAULT_ALIGNMENT
data_offset: int
# Note: Internal helper, API may change.
gguf_scalar_to_np: dict[GGUFValueType, type[np.generic]] = {
GGUFValueType.UINT8: np.uint8,
GGUFValueType.INT8: np.int8,
GGUFValueType.UINT16: np.uint16,
GGUFValueType.INT16: np.int16,
GGUFValueType.UINT32: np.uint32,
GGUFValueType.INT32: np.int32,
GGUFValueType.UINT8: np.uint8,
GGUFValueType.INT8: np.int8,
GGUFValueType.UINT16: np.uint16,
GGUFValueType.INT16: np.int16,
GGUFValueType.UINT32: np.uint32,
GGUFValueType.INT32: np.int32,
GGUFValueType.FLOAT32: np.float32,
GGUFValueType.UINT64: np.uint64,
GGUFValueType.INT64: np.int64,
GGUFValueType.UINT64: np.uint64,
GGUFValueType.INT64: np.int64,
GGUFValueType.FLOAT64: np.float64,
GGUFValueType.BOOL: np.bool_,
GGUFValueType.BOOL: np.bool_,
}
def __init__(self, path: os.PathLike[str] | str, mode: Literal['r', 'r+', 'c'] = 'r'):
self.data = np.memmap(path, mode = mode)
def __init__(
self, path: os.PathLike[str] | str, mode: Literal["r", "r+", "c"] = "r"
):
self.data = np.memmap(path, mode=mode)
offs = 0
# Check for GGUF magic
if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
raise ValueError('GGUF magic invalid')
if self._get(offs, np.uint32, override_order="<")[0] != GGUF_MAGIC:
raise ValueError("GGUF magic invalid")
offs += 4
# Check GGUF version
@ -100,28 +145,57 @@ def __init__(self, path: os.PathLike[str] | str, mode: Literal['r', 'r+', 'c'] =
if temp_version[0] & 65535 == 0:
# If we get 0 here that means it's (probably) a GGUF file created for
# the opposite byte order of the machine this script is running on.
self.byte_order = 'S'
temp_version = temp_version.newbyteorder(self.byte_order)
self.byte_order = "S"
temp_version = temp_version.view(
temp_version.dtype.newbyteorder(self.byte_order)
)
version = temp_version[0]
if version not in READER_SUPPORTED_VERSIONS:
raise ValueError(f'Sorry, file appears to be version {version} which we cannot handle')
raise ValueError(
f"Sorry, file appears to be version {version} which we cannot handle"
)
if sys.byteorder == "little":
# Host is little endian
host_endian = GGUFEndian.LITTLE
swapped_endian = GGUFEndian.BIG
else:
# Sorry, PDP or other unusual systems that use neither BE nor LE.
host_endian = GGUFEndian.BIG
swapped_endian = GGUFEndian.LITTLE
self.endianess = swapped_endian if self.byte_order == "S" else host_endian
self.fields: OrderedDict[str, ReaderField] = OrderedDict()
self.tensors: list[ReaderTensor] = []
offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32]))
offs += self._push_field(
ReaderField(
offs, "GGUF.version", [temp_version], [0], [GGUFValueType.UINT32]
)
)
# Check tensor count and kv count
temp_counts = self._get(offs, np.uint64, 2)
offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64]))
offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64]))
offs += self._push_field(
ReaderField(
offs,
"GGUF.tensor_count",
[temp_counts[:1]],
[0],
[GGUFValueType.UINT64],
)
)
offs += self._push_field(
ReaderField(
offs, "GGUF.kv_count", [temp_counts[1:]], [0], [GGUFValueType.UINT64]
)
)
tensor_count, kv_count = temp_counts
offs = self._build_fields(offs, kv_count)
# Build Tensor Info Fields
offs, tensors_fields = self._build_tensor_info(offs, tensor_count)
new_align = self.fields.get('general.alignment')
new_align = self.fields.get("general.alignment")
if new_align is not None:
if new_align.types != [GGUFValueType.UINT32]:
raise ValueError('Bad type for general.alignment field')
raise ValueError("Bad type for general.alignment field")
self.alignment = new_align.parts[-1][0]
padding = offs % self.alignment
if padding != 0:
@ -129,7 +203,7 @@ def __init__(self, path: os.PathLike[str] | str, mode: Literal['r', 'r+', 'c'] =
self.data_offset = offs
self._build_tensors(offs, tensors_fields)
_DT = TypeVar('_DT', bound = npt.DTypeLike)
_DT = TypeVar("_DT", bound=npt.DTypeLike)
# Fetch a key/value metadata field by key.
def get_field(self, key: str) -> Union[ReaderField, None]:
@ -140,15 +214,20 @@ def get_tensor(self, idx: int) -> ReaderTensor:
return self.tensors[idx]
def _get(
self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I', 'S', '<'] = None,
self,
offset: int,
dtype: npt.DTypeLike,
count: int = 1,
override_order: None | Literal["I", "S", "<"] = None,
) -> npt.NDArray[Any]:
count = int(count)
itemsize = int(np.empty([], dtype = dtype).itemsize)
itemsize = int(np.empty([], dtype=dtype).itemsize)
end_offs = offset + itemsize * count
return (
self.data[offset:end_offs]
.view(dtype = dtype)[:count]
.newbyteorder(override_order or self.byte_order)
arr = self.data[offset:end_offs].view(dtype=dtype)[:count]
return arr.view(
arr.dtype.newbyteorder(
self.byte_order if override_order is None else override_order
)
)
def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
@ -156,18 +235,22 @@ def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
# TODO: add option to generate error on duplicate keys
# raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}')
logger.warning(f'Duplicate key {field.name} at offset {field.offset}')
self.fields[field.name + '_{}'.format(field.offset)] = field
logger.warning(f"Duplicate key {field.name} at offset {field.offset}")
self.fields[field.name + "_{}".format(field.offset)] = field
else:
self.fields[field.name] = field
return 0 if skip_sum else sum(int(part.nbytes) for part in field.parts)
def _get_str(self, offset: int) -> tuple[npt.NDArray[np.uint64], npt.NDArray[np.uint8]]:
def _get_str(
self, offset: int
) -> tuple[npt.NDArray[np.uint64], npt.NDArray[np.uint8]]:
slen = self._get(offset, np.uint64)
return slen, self._get(offset + 8, np.uint8, slen[0])
def _get_field_parts(
self, orig_offs: int, raw_type: int,
self,
orig_offs: int,
raw_type: int,
) -> tuple[int, list[npt.NDArray[Any]], list[int], list[GGUFValueType]]:
offs = orig_offs
types: list[GGUFValueType] = []
@ -191,8 +274,11 @@ def _get_field_parts(
offs += int(alen.nbytes)
aparts: list[npt.NDArray[Any]] = [raw_itype, alen]
data_idxs: list[int] = []
# FIXME: Handle multi-dimensional arrays properly instead of flattening
for idx in range(alen[0]):
curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts(offs, raw_itype[0])
curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts(
offs, raw_itype[0]
)
if idx == 0:
types += curr_types
idxs_offs = len(aparts)
@ -201,7 +287,7 @@ def _get_field_parts(
offs += curr_size
return offs - orig_offs, aparts, data_idxs, types
# We can't deal with this one.
raise ValueError('Unknown/unhandled field type {gtype}')
raise ValueError(f"Unknown/unhandled field type {gtype}")
def _get_tensor_info_field(self, orig_offs: int) -> ReaderField:
offs = orig_offs
@ -228,7 +314,7 @@ def _get_tensor_info_field(self, orig_offs: int) -> ReaderField:
return ReaderField(
orig_offs,
str(bytes(name_data), encoding = 'utf-8'),
str(bytes(name_data), encoding="utf-8"),
[name_len, name_data, n_dims, dims, raw_dtype, offset_tensor],
[1, 3, 4, 5],
)
@ -242,19 +328,26 @@ def _build_fields(self, offs: int, count: int) -> int:
offs += int(raw_kv_type.nbytes)
parts: list[npt.NDArray[Any]] = [kv_klen, kv_kdata, raw_kv_type]
idxs_offs = len(parts)
field_size, field_parts, field_idxs, field_types = self._get_field_parts(offs, raw_kv_type[0])
field_size, field_parts, field_idxs, field_types = self._get_field_parts(
offs, raw_kv_type[0]
)
parts += field_parts
self._push_field(ReaderField(
orig_offs,
str(bytes(kv_kdata), encoding = 'utf-8'),
parts,
[idx + idxs_offs for idx in field_idxs],
field_types,
), skip_sum = True)
self._push_field(
ReaderField(
orig_offs,
str(bytes(kv_kdata), encoding="utf-8"),
parts,
[idx + idxs_offs for idx in field_idxs],
field_types,
),
skip_sum=True,
)
offs += field_size
return offs
def _build_tensor_info(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
def _build_tensor_info(
self, offs: int, count: int
) -> tuple[int, list[ReaderField]]:
tensor_fields = []
for _ in range(count):
field = self._get_tensor_info_field(offs)
@ -264,13 +357,13 @@ def _build_tensor_info(self, offs: int, count: int) -> tuple[int, list[ReaderFie
def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None:
tensors = []
tensor_names = set() # keep track of names to prevent duplicated tensors
tensor_names = set()  # keep track of names to prevent duplicated tensors
for field in fields:
_name_len, name_data, _n_dims, dims, raw_dtype, offset_tensor = field.parts
# check if there's any tensor having same name already in the list
tensor_name = str(bytes(name_data), encoding = 'utf-8')
tensor_name = str(bytes(name_data), encoding="utf-8")
if tensor_name in tensor_names:
raise ValueError(f'Found duplicated tensor with name {tensor_name}')
raise ValueError(f"Found duplicated tensor with name {tensor_name}")
tensor_names.add(tensor_name)
ggml_type = GGMLQuantizationType(raw_dtype[0])
n_elems = int(np.prod(dims))
@ -304,14 +397,16 @@ def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None:
item_count = n_bytes
item_type = np.uint8
np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
tensors.append(ReaderTensor(
name = tensor_name,
tensor_type = ggml_type,
shape = dims,
n_elements = n_elems,
n_bytes = n_bytes,
data_offset = data_offs,
data = self._get(data_offs, item_type, item_count).reshape(np_dims),
field = field,
))
tensors.append(
ReaderTensor(
name=tensor_name,
tensor_type=ggml_type,
shape=dims,
n_elements=n_elems,
n_bytes=n_bytes,
data_offset=data_offs,
data=self._get(data_offs, item_type, item_count).reshape(np_dims),
field=field,
)
)
self.tensors = tensors
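# A minimal read-side sketch (the path is hypothetical):
# reader = GGUFReader("model.gguf")            # memory-maps the file
# arch = reader.get_field("general.architecture")
# print(arch.contents() if arch is not None else "unknown")
# for t in reader.tensors:
#     print(t.name, t.tensor_type.name, tuple(t.shape), t.n_bytes)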


@ -26,6 +26,7 @@
RopeScalingType,
PoolingType,
TokenType,
ExpertGatingFuncType,
)
from .quants import quant_shape_from_byte_shape
@ -52,8 +53,8 @@ class GGUFValue:
class WriterState(Enum):
NO_FILE = auto()
EMPTY = auto()
HEADER = auto()
EMPTY = auto()
HEADER = auto()
KV_DATA = auto()
TI_DATA = auto()
WEIGHTS = auto()
@ -67,22 +68,29 @@ class GGUFWriter:
kv_data: list[dict[str, GGUFValue]]
state: WriterState
_simple_value_packing = {
GGUFValueType.UINT8: "B",
GGUFValueType.INT8: "b",
GGUFValueType.UINT16: "H",
GGUFValueType.INT16: "h",
GGUFValueType.UINT32: "I",
GGUFValueType.INT32: "i",
GGUFValueType.UINT8: "B",
GGUFValueType.INT8: "b",
GGUFValueType.UINT16: "H",
GGUFValueType.INT16: "h",
GGUFValueType.UINT32: "I",
GGUFValueType.INT32: "i",
GGUFValueType.FLOAT32: "f",
GGUFValueType.UINT64: "Q",
GGUFValueType.INT64: "q",
GGUFValueType.UINT64: "Q",
GGUFValueType.INT64: "q",
GGUFValueType.FLOAT64: "d",
GGUFValueType.BOOL: "?",
GGUFValueType.BOOL: "?",
}
def __init__(
self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False, endianess: GGUFEndian = GGUFEndian.LITTLE,
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False
self,
path: os.PathLike[str] | str | None,
arch: str,
use_temp_file: bool = False,
endianess: GGUFEndian = GGUFEndian.LITTLE,
split_max_tensors: int = 0,
split_max_size: int = 0,
dry_run: bool = False,
small_first_shard: bool = False,
):
self.fout = None
self.path = Path(path) if path else None
@ -97,9 +105,11 @@ def __init__(
self.split_max_size = split_max_size
self.dry_run = dry_run
self.small_first_shard = small_first_shard
logger.info("gguf: This GGUF file is for {0} Endian only".format(
"Big" if self.endianess == GGUFEndian.BIG else "Little",
))
logger.info(
"gguf: This GGUF file is for {0} Endian only".format(
"Big" if self.endianess == GGUFEndian.BIG else "Little",
)
)
self.state = WriterState.NO_FILE
if self.small_first_shard:
@ -128,7 +138,9 @@ def get_total_parameter_count(self) -> tuple[int, int, int, int]:
elif name.endswith(".lora_b"):
if last_lora_a is None or last_lora_a[0] != name[:-1] + "a":
# Bail when the LoRA pair can't be found trivially
logger.warning("can't measure LoRA size correctly, tensor order is unusual")
logger.warning(
"can't measure LoRA size correctly, tensor order is unusual"
)
return 0, 0, 0, 0
else:
shape = (*shape[:-1], last_lora_a[1].shape[-1])
@ -136,7 +148,7 @@ def get_total_parameter_count(self) -> tuple[int, int, int, int]:
size = prod(shape)
if "_exps." in name:
expert_params += (size // shape[-3])
expert_params += size // shape[-3]
expert_sum += shape[-3]
n_expert_tensors += 1
else:
@ -157,15 +169,26 @@ def get_total_parameter_count(self) -> tuple[int, int, int, int]:
def format_shard_names(self, path: Path) -> list[Path]:
if len(self.tensors) == 1:
return [path]
return [path.with_name(SHARD_NAME_FORMAT.format(path.stem, i + 1, len(self.tensors))) for i in range(len(self.tensors))]
return [
path.with_name(
SHARD_NAME_FORMAT.format(path.stem, i + 1, len(self.tensors))
)
for i in range(len(self.tensors))
]
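# Illustrative result, assuming the usual shard name format from this package's constants
# ("{:s}-{:05d}-of-{:05d}.gguf") and three shards for Path("Model-F16.gguf"):
# -> Model-F16-00001-of-00003.gguf, Model-F16-00002-of-00003.gguf, Model-F16-00003-of-00003.gguf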
def open_output_file(self, path: Path | None = None) -> None:
if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path):
if (
self.state is WriterState.EMPTY
and self.fout is not None
and (path is None or path == self.path)
):
# allow calling this multiple times as long as the path is the same
return
if self.state is not WriterState.NO_FILE:
raise ValueError(f'Expected output file to be not yet opened, got {self.state}')
raise ValueError(
f"Expected output file to be not yet opened, got {self.state}"
)
if path is not None:
self.path = path
@ -181,7 +204,9 @@ def print_plan(self) -> list[Path]:
filenames = self.format_shard_names(self.path)
assert len(filenames) == len(self.tensors)
for name, tensors in zip(filenames, self.tensors):
logger.info(f"{name}: n_tensors = {len(tensors)}, total_size = {GGUFWriter.format_n_bytes_to_str(sum(ti.nbytes for ti in tensors.values()))}")
logger.info(
f"{name}: n_tensors = {len(tensors)}, total_size = {GGUFWriter.format_n_bytes_to_str(sum(ti.nbytes for ti in tensors.values()))}"
)
if self.dry_run:
logger.info("Dry run, not writing files")
@ -201,17 +226,23 @@ def add_shard_kv_data(self) -> None:
self.kv_data.extend({} for _ in range(len(self.kv_data), total_splits))
for i, kv_data in enumerate(self.kv_data):
kv_data[Keys.Split.LLM_KV_SPLIT_NO] = GGUFValue(i, GGUFValueType.UINT16)
kv_data[Keys.Split.LLM_KV_SPLIT_COUNT] = GGUFValue(total_splits, GGUFValueType.UINT16)
kv_data[Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT] = GGUFValue(total_tensors, GGUFValueType.INT32)
kv_data[Keys.Split.LLM_KV_SPLIT_COUNT] = GGUFValue(
total_splits, GGUFValueType.UINT16
)
kv_data[Keys.Split.LLM_KV_SPLIT_TENSORS_COUNT] = GGUFValue(
total_tensors, GGUFValueType.INT32
)
def write_header_to_file(self, path: Path | None = None) -> None:
if len(self.tensors) == 1 and (self.split_max_tensors != 0 or self.split_max_size != 0):
if len(self.tensors) == 1 and (
self.split_max_tensors != 0 or self.split_max_size != 0
):
logger.warning("Model fails split requirements, not splitting")
self.open_output_file(path)
if self.state is not WriterState.EMPTY:
raise ValueError(f'Expected output file to be empty, got {self.state}')
raise ValueError(f"Expected output file to be empty, got {self.state}")
assert self.fout is not None
assert len(self.fout) == len(self.tensors)
@ -220,7 +251,7 @@ def write_header_to_file(self, path: Path | None = None) -> None:
self.add_shard_kv_data()
for fout, tensors, kv_data in zip(self.fout, self.tensors, self.kv_data):
fout.write(self._pack("<I", GGUF_MAGIC, skip_pack_prefix = True))
fout.write(self._pack("<I", GGUF_MAGIC, skip_pack_prefix=True))
fout.write(self._pack("I", GGUF_VERSION))
fout.write(self._pack("Q", len(tensors)))
fout.write(self._pack("Q", len(kv_data)))
@ -229,7 +260,9 @@ def write_header_to_file(self, path: Path | None = None) -> None:
def write_kv_data_to_file(self) -> None:
if self.state is not WriterState.HEADER:
raise ValueError(f'Expected output file to contain the header, got {self.state}')
raise ValueError(
f"Expected output file to contain the header, got {self.state}"
)
assert self.fout is not None
for fout, kv_data in zip(self.fout, self.kv_data):
@ -246,7 +279,9 @@ def write_kv_data_to_file(self) -> None:
def write_ti_data_to_file(self) -> None:
if self.state is not WriterState.KV_DATA:
raise ValueError(f'Expected output file to contain KV data, got {self.state}')
raise ValueError(
f"Expected output file to contain KV data, got {self.state}"
)
assert self.fout is not None
for fout, tensors in zip(self.fout, self.tensors):
@ -269,12 +304,12 @@ def write_ti_data_to_file(self) -> None:
def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None:
if any(key in kv_data for kv_data in self.kv_data):
raise ValueError(f'Duplicated key name {key!r}')
raise ValueError(f"Duplicated key name {key!r}")
self.kv_data[0][key] = GGUFValue(value=val, type=vtype)
def add_uint8(self, key: str, val: int) -> None:
self.add_key_value(key,val, GGUFValueType.UINT8)
self.add_key_value(key, val, GGUFValueType.UINT8)
def add_int8(self, key: str, val: int) -> None:
self.add_key_value(key, val, GGUFValueType.INT8)
@ -321,14 +356,20 @@ def ggml_pad(x: int, n: int) -> int:
return ((x + n - 1) // n) * n
def add_tensor_info(
self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype,
tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None,
self,
name: str,
tensor_shape: Sequence[int],
tensor_dtype: np.dtype,
tensor_nbytes: int,
raw_dtype: GGMLQuantizationType | None = None,
) -> None:
if self.state is not WriterState.NO_FILE:
raise ValueError(f'Expected output file to be not yet opened, got {self.state}')
raise ValueError(
f"Expected output file to be not yet opened, got {self.state}"
)
if any(name in tensors for tensors in self.tensors):
raise ValueError(f'Duplicated tensor name {name!r}')
raise ValueError(f"Duplicated tensor name {name!r}")
if raw_dtype is None:
if tensor_dtype == np.float16:
@ -346,7 +387,9 @@ def add_tensor_info(
elif tensor_dtype == np.int64:
dtype = GGMLQuantizationType.I64
else:
raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now")
raise ValueError(
"Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now"
)
else:
dtype = raw_dtype
if tensor_dtype == np.uint8:
@ -357,16 +400,22 @@ def add_tensor_info(
if ( # split when over tensor limit
self.split_max_tensors != 0
and len(self.tensors[-1]) >= self.split_max_tensors
) or ( # split when over size limit
) or ( # split when over size limit
self.split_max_size != 0
and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_max_size
and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes
> self.split_max_size
):
self.tensors.append({})
self.tensors[-1][name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes)
self.tensors[-1][name] = TensorInfo(
shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes
)
def add_tensor(
self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None,
self,
name: str,
tensor: np.ndarray[Any, Any],
raw_shape: Sequence[int] | None = None,
raw_dtype: GGMLQuantizationType | None = None,
) -> None:
if self.endianess == GGUFEndian.BIG:
@ -377,7 +426,9 @@ def add_tensor(
self.temp_file = fp
shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape
self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype)
self.add_tensor_info(
name, shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype
)
if self.temp_file is None:
self.tensors[-1][name].tensor = tensor
@ -387,13 +438,21 @@ def add_tensor(
self.write_padding(self.temp_file, tensor.nbytes)
def write_padding(self, fp: IO[bytes], n: int, align: int | None = None) -> None:
pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n
pad = (
GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment)
- n
)
if pad != 0:
fp.write(bytes([0] * pad))
def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
if self.state is not WriterState.TI_DATA and self.state is not WriterState.WEIGHTS:
raise ValueError(f'Expected output file to contain tensor info or weights, got {self.state}')
if (
self.state is not WriterState.TI_DATA
and self.state is not WriterState.WEIGHTS
):
raise ValueError(
f"Expected output file to contain tensor info or weights, got {self.state}"
)
assert self.fout is not None
if self.endianess == GGUFEndian.BIG:
@ -409,7 +468,9 @@ def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
# pop the first tensor info
# TODO: cleaner way to get the first key
first_tensor_name = [name for name, _ in zip(self.tensors[file_id].keys(), range(1))][0]
first_tensor_name = [
name for name, _ in zip(self.tensors[file_id].keys(), range(1))
][0]
ti = self.tensors[file_id].pop(first_tensor_name)
assert ti.nbytes == tensor.nbytes
@ -437,8 +498,15 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None:
total_bytes = sum(ti.nbytes for t in self.tensors for ti in t.values())
if len(self.fout) > 1:
shard_bar = tqdm(desc=f"Shard (0/{len(self.fout)})", total=None, unit="byte", unit_scale=True)
bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
shard_bar = tqdm(
desc=f"Shard (0/{len(self.fout)})",
total=None,
unit="byte",
unit_scale=True,
)
bar = tqdm(
desc="Writing", total=total_bytes, unit="byte", unit_scale=True
)
for i, (fout, tensors) in enumerate(zip(self.fout, self.tensors)):
if shard_bar is not None:
@ -448,7 +516,9 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None:
# relying on the fact that Python dicts preserve insertion order (since 3.7)
for ti in tensors.values():
assert ti.tensor is not None # can only iterate once over the tensors
assert (
ti.tensor is not None
) # can only iterate once over the tensors
assert ti.tensor.nbytes == ti.nbytes
ti.tensor.tofile(fout)
if shard_bar is not None:
@ -460,7 +530,9 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None:
else:
self.temp_file.seek(0)
shutil.copyfileobj(self.temp_file, self.fout[0 if not self.small_first_shard else 1])
shutil.copyfileobj(
self.temp_file, self.fout[0 if not self.small_first_shard else 1]
)
self.flush()
self.temp_file.close()
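# A minimal end-to-end write sketch (names, shapes and values are placeholders, not from the diff):
# import numpy as np
# writer = GGUFWriter("tiny.gguf", arch="llama")
# writer.add_block_count(2)
# writer.add_context_length(2048)
# writer.add_tensor("token_embd.weight", np.zeros((16, 8), dtype=np.float32))
# writer.write_header_to_file()
# writer.write_kv_data_to_file()
# writer.write_tensors_to_file(progress=True)
# writer.close()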
@ -566,7 +638,14 @@ def add_base_model_version(self, source_id: int, version: str) -> None:
self.add_string(Keys.General.BASE_MODEL_VERSION.format(id=source_id), version)
def add_base_model_organization(self, source_id: int, organization: str) -> None:
self.add_string(Keys.General.BASE_MODEL_ORGANIZATION.format(id=source_id), organization)
self.add_string(
Keys.General.BASE_MODEL_ORGANIZATION.format(id=source_id), organization
)
def add_base_model_description(self, source_id: int, description: str) -> None:
self.add_string(
Keys.General.BASE_MODEL_DESCRIPTION.format(id=source_id), description
)
def add_base_model_url(self, source_id: int, url: str) -> None:
self.add_string(Keys.General.BASE_MODEL_URL.format(id=source_id), url)
@ -580,15 +659,46 @@ def add_base_model_uuid(self, source_id: int, uuid: str) -> None:
def add_base_model_repo_url(self, source_id: int, repo_url: str) -> None:
self.add_string(Keys.General.BASE_MODEL_REPO_URL.format(id=source_id), repo_url)
def add_dataset_count(self, source_count: int) -> None:
self.add_uint32(Keys.General.DATASET_COUNT, source_count)
def add_dataset_name(self, source_id: int, name: str) -> None:
self.add_string(Keys.General.DATASET_NAME.format(id=source_id), name)
def add_dataset_author(self, source_id: int, author: str) -> None:
self.add_string(Keys.General.DATASET_AUTHOR.format(id=source_id), author)
def add_dataset_version(self, source_id: int, version: str) -> None:
self.add_string(Keys.General.DATASET_VERSION.format(id=source_id), version)
def add_dataset_organization(self, source_id: int, organization: str) -> None:
self.add_string(
Keys.General.DATASET_ORGANIZATION.format(id=source_id), organization
)
def add_dataset_description(self, source_id: int, description: str) -> None:
self.add_string(
Keys.General.DATASET_DESCRIPTION.format(id=source_id), description
)
def add_dataset_url(self, source_id: int, url: str) -> None:
self.add_string(Keys.General.DATASET_URL.format(id=source_id), url)
def add_dataset_doi(self, source_id: int, doi: str) -> None:
self.add_string(Keys.General.DATASET_DOI.format(id=source_id), doi)
def add_dataset_uuid(self, source_id: int, uuid: str) -> None:
self.add_string(Keys.General.DATASET_UUID.format(id=source_id), uuid)
def add_dataset_repo_url(self, source_id: int, repo_url: str) -> None:
self.add_string(Keys.General.DATASET_REPO_URL.format(id=source_id), repo_url)
def add_tags(self, tags: Sequence[str]) -> None:
self.add_array(Keys.General.TAGS, tags)
def add_languages(self, languages: Sequence[str]) -> None:
self.add_array(Keys.General.LANGUAGES, languages)
def add_datasets(self, datasets: Sequence[str]) -> None:
self.add_array(Keys.General.DATASETS, datasets)
def add_tensor_data_layout(self, layout: str) -> None:
self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
@ -601,11 +711,28 @@ def add_context_length(self, length: int) -> None:
def add_embedding_length(self, length: int) -> None:
self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length)
def add_features_length(self, length: int) -> None:
self.add_uint32(Keys.LLM.FEATURES_LENGTH.format(arch=self.arch), length)
def add_posnet_embedding_length(self, length: int) -> None:
self.add_uint32(Keys.PosNet.EMBEDDING_LENGTH.format(arch=self.arch), length)
def add_posnet_block_count(self, length: int) -> None:
self.add_uint32(Keys.PosNet.BLOCK_COUNT.format(arch=self.arch), length)
def add_convnext_embedding_length(self, length: int) -> None:
self.add_uint32(Keys.ConvNext.EMBEDDING_LENGTH.format(arch=self.arch), length)
def add_convnext_block_count(self, length: int) -> None:
self.add_uint32(Keys.ConvNext.BLOCK_COUNT.format(arch=self.arch), length)
def add_block_count(self, length: int) -> None:
self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)
def add_leading_dense_block_count(self, length: int) -> None:
self.add_uint32(Keys.LLM.LEADING_DENSE_BLOCK_COUNT.format(arch=self.arch), length)
self.add_uint32(
Keys.LLM.LEADING_DENSE_BLOCK_COUNT.format(arch=self.arch), length
)
def add_feed_forward_length(self, length: int | Sequence[int]) -> None:
if isinstance(length, int):
@ -614,10 +741,14 @@ def add_feed_forward_length(self, length: int | Sequence[int]) -> None:
self.add_array(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length)
def add_expert_feed_forward_length(self, length: int) -> None:
self.add_uint32(Keys.LLM.EXPERT_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
self.add_uint32(
Keys.LLM.EXPERT_FEED_FORWARD_LENGTH.format(arch=self.arch), length
)
def add_expert_shared_feed_forward_length(self, length: int) -> None:
self.add_uint32(Keys.LLM.EXPERT_SHARED_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
self.add_uint32(
Keys.LLM.EXPERT_SHARED_FEED_FORWARD_LENGTH.format(arch=self.arch), length
)
def add_parallel_residual(self, use: bool) -> None:
self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
@ -643,6 +774,12 @@ def add_key_length(self, length: int) -> None:
def add_value_length(self, length: int) -> None:
self.add_uint32(Keys.Attention.VALUE_LENGTH.format(arch=self.arch), length)
def add_key_length_mla(self, length: int) -> None:
self.add_uint32(Keys.Attention.KEY_LENGTH_MLA.format(arch=self.arch), length)
def add_value_length_mla(self, length: int) -> None:
self.add_uint32(Keys.Attention.VALUE_LENGTH_MLA.format(arch=self.arch), length)
def add_max_alibi_bias(self, bias: float) -> None:
self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias)
@ -670,12 +807,56 @@ def add_expert_shared_count(self, count: int) -> None:
def add_expert_weights_scale(self, value: float) -> None:
self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)
def add_expert_weights_norm(self, value: bool) -> None:
self.add_bool(Keys.LLM.EXPERT_WEIGHTS_NORM.format(arch=self.arch), value)
def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None:
self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value)
def add_moe_every_n_layers(self, value: int) -> None:
self.add_uint32(Keys.LLM.MOE_EVERY_N_LAYERS.format(arch=self.arch), value)
def add_swin_norm(self, value: bool) -> None:
self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)
def add_rescale_every_n_layers(self, count: int) -> None:
self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count)
def add_time_mix_extra_dim(self, dim: int) -> None:
self.add_uint32(Keys.LLM.TIME_MIX_EXTRA_DIM.format(arch=self.arch), dim)
def add_time_decay_extra_dim(self, dim: int) -> None:
self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim)
def add_residual_scale(self, value: float) -> None:
self.add_float32(Keys.LLM.RESIDUAL_SCALE.format(arch=self.arch), value)
def add_embedding_scale(self, value: float) -> None:
self.add_float32(Keys.LLM.EMBEDDING_SCALE.format(arch=self.arch), value)
def add_wkv_head_size(self, size: int) -> None:
self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size)
def add_token_shift_count(self, count: int) -> None:
self.add_uint32(Keys.LLM.TOKEN_SHIFT_COUNT.format(arch=self.arch), count)
def add_interleave_moe_layer_step(self, value: int) -> None:
self.add_uint32(
Keys.LLM.INTERLEAVE_MOE_LAYER_STEP.format(arch=self.arch), value
)
def add_layer_norm_eps(self, value: float) -> None:
self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
def add_layer_norm_rms_eps(self, value: float) -> None:
self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value)
def add_group_norm_eps(self, value: float) -> None:
self.add_float32(Keys.Attention.GROUPNORM_EPS.format(arch=self.arch), value)
def add_group_norm_groups(self, value: int) -> None:
self.add_uint32(Keys.Attention.GROUPNORM_GROUPS.format(arch=self.arch), value)
def add_causal_attention(self, value: bool) -> None:
self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
@ -685,18 +866,38 @@ def add_q_lora_rank(self, length: int) -> None:
def add_kv_lora_rank(self, length: int) -> None:
self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length)
def add_decay_lora_rank(self, length: int) -> None:
self.add_uint32(Keys.Attention.DECAY_LORA_RANK.format(arch=self.arch), length)
def add_iclr_lora_rank(self, length: int) -> None:
self.add_uint32(Keys.Attention.ICLR_LORA_RANK.format(arch=self.arch), length)
def add_value_residual_mix_lora_rank(self, length: int) -> None:
self.add_uint32(
Keys.Attention.VALUE_RESIDUAL_MIX_LORA_RANK.format(arch=self.arch), length
)
def add_gate_lora_rank(self, length: int) -> None:
self.add_uint32(Keys.Attention.GATE_LORA_RANK.format(arch=self.arch), length)
def add_relative_attn_buckets_count(self, value: int) -> None:
self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value)
def add_sliding_window(self, value: int) -> None:
self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value)
def add_attention_scale(self, value: float) -> None:
self.add_float32(Keys.Attention.SCALE.format(arch=self.arch), value)
def add_pooling_type(self, value: PoolingType) -> None:
self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
def add_rope_dimension_count(self, count: int) -> None:
self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
def add_rope_dimension_sections(self, dims: Sequence[int]) -> None:
self.add_array(Keys.Rope.DIMENSION_SECTIONS.format(arch=self.arch), dims)
def add_rope_freq_base(self, value: float) -> None:
self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value)
@ -730,16 +931,23 @@ def add_ssm_state_size(self, value: int) -> None:
def add_ssm_time_step_rank(self, value: int) -> None:
self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value)
def add_ssm_dt_b_c_rms(self, value: bool) -> None:
self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value)
def add_tokenizer_model(self, model: str) -> None:
self.add_string(Keys.Tokenizer.MODEL, model)
def add_tokenizer_pre(self, pre: str) -> None:
self.add_string(Keys.Tokenizer.PRE, pre)
def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None:
def add_token_list(
self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]
) -> None:
self.add_array(Keys.Tokenizer.LIST, tokens)
def add_token_merges(self, merges: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None:
def add_token_merges(
self, merges: Sequence[str] | Sequence[bytes] | Sequence[bytearray]
) -> None:
self.add_array(Keys.Tokenizer.MERGES, merges)
def add_token_types(self, types: Sequence[TokenType] | Sequence[int]) -> None:
@ -766,9 +974,6 @@ def add_sep_token_id(self, id: int) -> None:
def add_pad_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.PAD_ID, id)
def add_cls_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.CLS_ID, id)
def add_mask_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.MASK_ID, id)
@ -793,18 +998,22 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
template_names = set()
for choice in value:
name = choice.get('name', '')
template = choice.get('template')
name = choice.get("name", "")
template = choice.get("template")
# Allowing non-alphanumerical characters in template name is probably not a good idea, so filter it
name = ''.join((c if c in ascii_letters + digits else '_' for c in name))
name = "".join(
(c if c in ascii_letters + digits else "_" for c in name)
)
if name and template is not None:
if name == 'default':
if name == "default":
template_default = template
else:
template_names.add(name)
self.add_string(Keys.Tokenizer.CHAT_TEMPLATE_N.format(name=name), template)
self.add_string(
Keys.Tokenizer.CHAT_TEMPLATE_N.format(name=name), template
)
if template_names:
self.add_array(Keys.Tokenizer.CHAT_TEMPLATES, list(template_names))
@ -816,23 +1025,70 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
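# Illustrative input (templates are placeholders): either a single template string, or a list of
# {"name": ..., "template": ...} mappings where the entry named "default" becomes the main chat
# template and the others are written via Keys.Tokenizer.CHAT_TEMPLATE_N:
# writer.add_chat_template([
#     {"name": "default", "template": "..."},
#     {"name": "tool_use", "template": "..."},
# ])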
def add_prefix_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.PREFIX_ID, id)
def add_suffix_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id)
def add_middle_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id)
def add_eot_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.EOT_ID, id)
def add_eom_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.EOM_ID, id)
# for vision models
def add_vision_projection_dim(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.PROJECTION_DIM, value)
def add_vision_has_vision_encoder(self, value: bool) -> None:
self.add_bool(Keys.ClipVision.HAS_VISION_ENCODER, value)
def add_vision_patch_size(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.PATCH_SIZE, value)
def add_vision_embedding_length(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.EMBEDDING_LENGTH, value)
def add_vision_feed_forward_length(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.FEED_FORWARD_LENGTH, value)
def add_vision_block_count(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.BLOCK_COUNT, value)
def add_vision_head_count(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.Attention.HEAD_COUNT, value)
def add_vision_projector_type(self, value: str) -> None:
self.add_string(Keys.ClipVision.PROJECTOR_TYPE, value)
def add_vision_attention_layernorm_eps(self, value: float) -> None:
self.add_float32(Keys.ClipVision.Attention.LAYERNORM_EPS, value)
def add_vision_image_size(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value)
def add_vision_image_mean(self, values: Sequence[float]) -> None:
self.add_array(Keys.ClipVision.IMAGE_MEAN, values)
def add_vision_image_std(self, values: Sequence[float]) -> None:
self.add_array(Keys.ClipVision.IMAGE_STD, values)
def add_vision_spatial_merge_size(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.SPATIAL_MERGE_SIZE, value)
def add_vision_use_gelu(self, value: bool) -> None:
self.add_bool(Keys.ClipVision.USE_GELU, value)
def add_vision_use_silu(self, value: bool) -> None:
self.add_bool(Keys.ClipVision.USE_SILU, value)
def add_vision_projector_scale_factor(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.Projector.SCALE_FACTOR, value)
def add_vision_n_wa_pattern(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value)
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
pack_prefix = ''
pack_prefix = ""
if not skip_pack_prefix:
pack_prefix = '<' if self.endianess == GGUFEndian.LITTLE else '>'
return struct.pack(f'{pack_prefix}{fmt}', value)
pack_prefix = "<" if self.endianess == GGUFEndian.LITTLE else ">"
return struct.pack(f"{pack_prefix}{fmt}", value)
def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool) -> bytes:
kv_data = bytearray()
@ -842,7 +1098,9 @@ def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool) -> bytes:
pack_fmt = self._simple_value_packing.get(vtype)
if pack_fmt is not None:
kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL)
kv_data += self._pack(
pack_fmt, val, skip_pack_prefix=vtype == GGUFValueType.BOOL
)
elif vtype == GGUFValueType.STRING:
encoded_val = val.encode("utf-8") if isinstance(val, str) else val
kv_data += self._pack("Q", len(encoded_val))
@ -860,7 +1118,9 @@ def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool) -> bytes:
else:
ltype = GGUFValueType.get_type(val[0])
if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
raise ValueError("All items in a GGUF array should be of the same type")
raise ValueError(
"All items in a GGUF array should be of the same type"
)
kv_data += self._pack("I", ltype)
kv_data += self._pack("Q", len(val))
for item in val:


@ -13,7 +13,9 @@
class LazyMeta(ABCMeta):
def __new__(cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs):
def __new__(
cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs
):
def __getattr__(self, name: str) -> Any:
meta_attr = getattr(self._meta, name)
if callable(meta_attr):
@ -41,6 +43,7 @@ def wrapped_special_op(self, *args, **kwargs):
getattr(type(self)._tensor_type, op_name),
meta_noop=meta_noop,
)(self, *args, **kwargs)
return wrapped_special_op
# special methods bypass __getattr__, so they need to be added manually
@ -48,11 +51,48 @@ def wrapped_special_op(self, *args, **kwargs):
# NOTE: doing this from a metaclass is very convenient
# TODO: make this even more comprehensive
for binary_op in (
"lt", "le", "eq", "ne", "ge", "gt", "not"
"abs", "add", "and", "floordiv", "invert", "lshift", "mod", "mul", "matmul",
"neg", "or", "pos", "pow", "rshift", "sub", "truediv", "xor",
"iadd", "iand", "ifloordiv", "ilshift", "imod", "imul", "ior", "irshift", "isub", "ixor",
"radd", "rand", "rfloordiv", "rmul", "ror", "rpow", "rsub", "rtruediv", "rxor",
"lt",
"le",
"eq",
"ne",
"ge",
"gt",
"not" "abs",
"add",
"and",
"floordiv",
"invert",
"lshift",
"mod",
"mul",
"matmul",
"neg",
"or",
"pos",
"pow",
"rshift",
"sub",
"truediv",
"xor",
"iadd",
"iand",
"ifloordiv",
"ilshift",
"imod",
"imul",
"ior",
"irshift",
"isub",
"ixor",
"radd",
"rand",
"rfloordiv",
"rmul",
"ror",
"rpow",
"rsub",
"rtruediv",
"rxor",
):
attr_name = f"__{binary_op}__"
# the result of these operators usually has the same shape and dtype as the input,
@ -60,7 +100,9 @@ def wrapped_special_op(self, *args, **kwargs):
namespace[attr_name] = mk_wrap(attr_name, meta_noop=True)
for special_op in (
"getitem", "setitem", "len",
"getitem",
"setitem",
"len",
):
attr_name = f"__{special_op}__"
namespace[attr_name] = mk_wrap(attr_name, meta_noop=False)
@ -77,7 +119,15 @@ class LazyBase(ABC, metaclass=LazyMeta):
_kwargs: dict[str, Any]
_func: Callable[[Any], Any] | None
def __init__(self, *, meta: Any, data: Any | None = None, args: tuple = (), kwargs: dict[str, Any] | None = None, func: Callable[[Any], Any] | None = None):
def __init__(
self,
*,
meta: Any,
data: Any | None = None,
args: tuple = (),
kwargs: dict[str, Any] | None = None,
func: Callable[[Any], Any] | None = None,
):
super().__init__()
self._meta = meta
self._data = data
@ -107,7 +157,17 @@ def _recurse_apply(o: Any, fn: Callable[[Any], Any]) -> Any:
return o
@classmethod
def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike | tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]] = False) -> Callable[[Any], Any]:
def _wrap_fn(
cls,
fn: Callable,
*,
use_self: LazyBase | None = None,
meta_noop: (
bool
| DTypeLike
| tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]]
) = False,
) -> Callable[[Any], Any]:
def wrapped_fn(*args, **kwargs):
if kwargs is None:
kwargs = {}
@ -138,13 +198,37 @@ def wrapped_fn(*args, **kwargs):
res = cls.meta_with_dtype_and_shape(meta_noop, res.shape)
if isinstance(res, cls._tensor_type):
return cls(meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn)
return cls(
meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn
)
elif isinstance(res, tuple) and all(
isinstance(t, cls._tensor_type) for t in res
):
# share the evaluation between lazy tuple elements
shared_args: list = [args, None]
def eager_tuple_element(a: list[Any], i: int = 0, /, **kw) -> LazyBase:
assert len(a) == 2
if a[1] is None:
a[1] = fn(*a[0], **kw)
return a[1][i]
return tuple(
cls(
meta=cls.eager_to_meta(res[i]),
args=(shared_args, i),
kwargs=kwargs,
func=eager_tuple_element,
)
for i in range(len(res))
)
else:
del res # not needed
# non-tensor return likely relies on the contents of the args
# (e.g. the result of torch.equal)
eager_args = cls.to_eager(args)
return fn(*eager_args, **kwargs)
return wrapped_fn
@classmethod
@ -175,7 +259,8 @@ def eager_to_meta(cls, t: Any) -> Any:
# must be overridden, meta tensor init is backend-specific
@classmethod
@abstractmethod
def meta_with_dtype_and_shape(cls, dtype: Any, shape: Any) -> Any: pass
def meta_with_dtype_and_shape(cls, dtype: Any, shape: Any) -> Any:
pass
@classmethod
def from_eager(cls, t: Any) -> Any:
@ -191,8 +276,12 @@ def from_eager(cls, t: Any) -> Any:
class LazyNumpyTensor(LazyBase):
_tensor_type = np.ndarray
shape: tuple[int, ...] # Makes the type checker happy in quants.py
@classmethod
def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
def meta_with_dtype_and_shape(
cls, dtype: DTypeLike, shape: tuple[int, ...]
) -> np.ndarray[Any, Any]:
# The initial idea was to use np.nan as the fill value,
# but non-float types like np.int16 can't use that.
# So zero it is.
@ -201,8 +290,16 @@ def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) ->
def astype(self, dtype, *args, **kwargs):
meta = type(self).meta_with_dtype_and_shape(dtype, self._meta.shape)
full_args = (self, dtype,) + args
return type(self)(meta=meta, args=full_args, kwargs=kwargs, func=(lambda a, *args, **kwargs: a.astype(*args, **kwargs)))
full_args = (
self,
dtype,
) + args
return type(self)(
meta=meta,
args=full_args,
kwargs=kwargs,
func=(lambda a, *args, **kwargs: a.astype(*args, **kwargs)),
)
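# Illustrative lazy evaluation (a sketch; the array contents are arbitrary):
# lt = LazyNumpyTensor.from_eager(np.arange(6, dtype=np.float32).reshape(2, 3))
# lt2 = (lt + 1.0).astype(np.float16)   # only builds a deferred graph, no computation yet
# LazyNumpyTensor.to_eager(lt2)         # evaluates the graph -> float16 array of shape (2, 3)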
def tofile(self, *args, **kwargs):
eager = LazyNumpyTensor.to_eager(self)

src/gguf/metadata.py (863 lines, normal file)

@ -0,0 +1,863 @@
from __future__ import annotations
import re
import json
import yaml
import logging
from pathlib import Path
from typing import Any, Literal, Optional
from dataclasses import dataclass
from .constants import Keys
import gguf
logger = logging.getLogger("metadata")
@dataclass
class Metadata:
# Authorship Metadata to be written to GGUF KV Store
name: Optional[str] = None
author: Optional[str] = None
version: Optional[str] = None
organization: Optional[str] = None
finetune: Optional[str] = None
basename: Optional[str] = None
description: Optional[str] = None
quantized_by: Optional[str] = None
size_label: Optional[str] = None
url: Optional[str] = None
doi: Optional[str] = None
uuid: Optional[str] = None
repo_url: Optional[str] = None
source_url: Optional[str] = None
source_doi: Optional[str] = None
source_uuid: Optional[str] = None
source_repo_url: Optional[str] = None
license: Optional[str] = None
license_name: Optional[str] = None
license_link: Optional[str] = None
base_models: Optional[list[dict]] = None
tags: Optional[list[str]] = None
languages: Optional[list[str]] = None
datasets: Optional[list[dict]] = None
@staticmethod
def load(
metadata_override_path: Optional[Path] = None,
model_path: Optional[Path] = None,
model_name: Optional[str] = None,
total_params: int = 0,
) -> Metadata:
# This grabs as much contextual authorship metadata as possible from the model repository,
# converting it as required to match the GGUF KV store metadata format,
# and gives users the ability to override any authorship metadata that may be incorrect
# Create a new Metadata instance
metadata = Metadata()
model_card = Metadata.load_model_card(model_path)
hf_params = Metadata.load_hf_parameters(model_path)
# TODO: load adapter_config.json when possible, it usually contains the base model of the LoRA adapter
# heuristics
metadata = Metadata.apply_metadata_heuristic(
metadata, model_card, hf_params, model_path, total_params
)
# Metadata Override File Provided
# This is based on LLM_KV_NAMES mapping in llama.cpp
metadata_override = Metadata.load_metadata_override(metadata_override_path)
metadata.name = metadata_override.get(Keys.General.NAME, metadata.name)
metadata.author = metadata_override.get(Keys.General.AUTHOR, metadata.author)
metadata.version = metadata_override.get(Keys.General.VERSION, metadata.version)
metadata.organization = metadata_override.get(
Keys.General.ORGANIZATION, metadata.organization
)
metadata.finetune = metadata_override.get(
Keys.General.FINETUNE, metadata.finetune
)
metadata.basename = metadata_override.get(
Keys.General.BASENAME, metadata.basename
)
metadata.description = metadata_override.get(
Keys.General.DESCRIPTION, metadata.description
)
metadata.quantized_by = metadata_override.get(
Keys.General.QUANTIZED_BY, metadata.quantized_by
)
metadata.size_label = metadata_override.get(
Keys.General.SIZE_LABEL, metadata.size_label
)
metadata.license_name = metadata_override.get(
Keys.General.LICENSE_NAME, metadata.license_name
)
metadata.license_link = metadata_override.get(
Keys.General.LICENSE_LINK, metadata.license_link
)
metadata.url = metadata_override.get(Keys.General.URL, metadata.url)
metadata.doi = metadata_override.get(Keys.General.DOI, metadata.doi)
metadata.uuid = metadata_override.get(Keys.General.UUID, metadata.uuid)
metadata.repo_url = metadata_override.get(
Keys.General.REPO_URL, metadata.repo_url
)
metadata.source_url = metadata_override.get(
Keys.General.SOURCE_URL, metadata.source_url
)
metadata.source_doi = metadata_override.get(
Keys.General.SOURCE_DOI, metadata.source_doi
)
metadata.source_uuid = metadata_override.get(
Keys.General.SOURCE_UUID, metadata.source_uuid
)
metadata.source_repo_url = metadata_override.get(
Keys.General.SOURCE_REPO_URL, metadata.source_repo_url
)
# Base Models is received here as an array of models
metadata.base_models = metadata_override.get(
"general.base_models", metadata.base_models
)
# Datasets is received here as an array of datasets
metadata.datasets = metadata_override.get("general.datasets", metadata.datasets)
metadata.tags = metadata_override.get(Keys.General.TAGS, metadata.tags)
metadata.languages = metadata_override.get(
Keys.General.LANGUAGES, metadata.languages
)
# Direct Metadata Override (via direct cli argument)
if model_name is not None:
metadata.name = model_name
return metadata
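# Illustrative usage sketch (not part of the original file; the paths and parameter
# count below are hypothetical): load() applies heuristics from the model repo and
# then lets an optional override file win over them:
#   meta = Metadata.load(
#       metadata_override_path=Path("overrides.json"),
#       model_path=Path("./Mistral-7B-Instruct-v0.2"),
#       total_params=7_241_732_096,
#   )
#   # keys in overrides.json such as "general.name" replace the heuristic values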
@staticmethod
def load_metadata_override(
metadata_override_path: Optional[Path] = None,
) -> dict[str, Any]:
if metadata_override_path is None or not metadata_override_path.is_file():
return {}
with open(metadata_override_path, "r", encoding="utf-8") as f:
return json.load(f)
@staticmethod
def load_model_card(model_path: Optional[Path] = None) -> dict[str, Any]:
if model_path is None or not model_path.is_dir():
return {}
model_card_path = model_path / "README.md"
if not model_card_path.is_file():
return {}
# The model card metadata is assumed to always be in YAML (frontmatter)
# ref: https://github.com/huggingface/transformers/blob/a5c642fe7a1f25d3bdcd76991443ba6ff7ee34b2/src/transformers/modelcard.py#L468-L473
yaml_content: str = ""
with open(model_card_path, "r", encoding="utf-8") as f:
content = f.read()
lines = content.splitlines()
lines_yaml = []
if len(lines) == 0:
# Empty file
return {}
if len(lines) > 0 and lines[0] != "---":
# No frontmatter
return {}
for line in lines[1:]:
if line == "---":
break # End of frontmatter
else:
lines_yaml.append(line)
yaml_content = "\n".join(lines_yaml) + "\n"
# Quick hack to fix the Norway problem
# https://hitchdev.com/strictyaml/why/implicit-typing-removed/
yaml_content = yaml_content.replace("- no\n", '- "no"\n')
if yaml_content:
data = yaml.safe_load(yaml_content)
if isinstance(data, dict):
return data
else:
logger.error(
f"while reading YAML model card frontmatter, data is {type(data)} instead of dict"
)
return {}
else:
return {}
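# Illustrative note (not part of the original file): the "Norway problem" hack above
# matters because YAML 1.1 resolves a bare "no" to boolean False, so frontmatter like
#   ---
#   language:
#   - no
#   ---
# would otherwise lose the Norwegian language code; quoting it keeps the string "no".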
@staticmethod
def load_hf_parameters(model_path: Optional[Path] = None) -> dict[str, Any]:
if model_path is None or not model_path.is_dir():
return {}
config_path = model_path / "config.json"
if not config_path.is_file():
return {}
with open(config_path, "r", encoding="utf-8") as f:
return json.load(f)
@staticmethod
def id_to_title(string):
# Convert words to title case unless they are acronyms or version numbers
return " ".join(
[
(
w.title()
if w.islower() and not re.match(r"^(v\d+(?:\.\d+)*|\d.*)$", w)
else w
)
for w in string.strip().replace("-", " ").split()
]
)
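# Illustrative usage sketch (not part of the original file):
#   Metadata.id_to_title("Mixtral-8x7B-Instruct-v0.1")  # -> "Mixtral 8x7B Instruct v0.1"
# Lowercase words are title-cased; acronyms and version-like tokens are left untouched.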
@staticmethod
def get_model_id_components(
model_id: Optional[str] = None, total_params: int = 0
) -> tuple[str | None, str | None, str | None, str | None, str | None, str | None]:
# Hugging Face often stores the model id as '<org>/<model name>',
# so parse it and apply some heuristics where possible to extract the model name components
if model_id is None:
# model ID missing
return None, None, None, None, None, None
if " " in model_id:
# model ID is actually a normal human sentence,
# which means it's most likely just a plain model name,
# not part of the Hugging Face naming standard
return model_id, None, None, None, None, None
if "/" in model_id:
# model ID (huggingface style)
org_component, model_full_name_component = model_id.split("/", 1)
else:
# model ID but missing org components
org_component, model_full_name_component = None, model_id
# Check if we erroneously matched against './' or '../' etc...
if (
org_component is not None
and len(org_component) > 0
and org_component[0] == "."
):
org_component = None
name_parts: list[str] = model_full_name_component.split("-")
# Remove empty parts
for i in reversed(range(len(name_parts))):
if len(name_parts[i]) == 0:
del name_parts[i]
name_types: list[
set[Literal["basename", "size_label", "finetune", "version", "type"]]
] = [set() for _ in name_parts]
# Annotate the name
for i, part in enumerate(name_parts):
# Version
if re.fullmatch(r"(v|iter)?\d+([.]\d+)*", part, re.IGNORECASE):
name_types[i].add("version")
# Quant type (should not be there for base models, but still annotated)
elif re.fullmatch(r"i?q\d(_\w)*|b?fp?(16|32)", part, re.IGNORECASE):
name_types[i].add("type")
name_parts[i] = part.upper()
# Model size
elif i > 0 and re.fullmatch(
r"(([A]|\d+[x])?\d+([._]\d+)?[KMBT][\d]?|small|mini|medium|large|x?xl)",
part,
re.IGNORECASE,
):
part = part.replace("_", ".")
# Handle weird bloom-7b1 notation
if part[-1].isdecimal():
part = part[:-2] + "." + part[-1] + part[-2]
# Normalize the size suffixes
if len(part) > 1 and part[-2].isdecimal():
if part[-1] in "kmbt":
part = part[:-1] + part[-1].upper()
if total_params != 0:
try:
label_params = float(part[:-1]) * pow(
1000, " KMBT".find(part[-1])
)
# Only use it as a size label if it's close or bigger than the model size
# Note that LoRA adapters don't necessarily include all layers,
# so this is why bigger label sizes are accepted.
# Do not use the size label when it's smaller than 1/8 of the model size
if (
total_params < 0 and label_params < abs(total_params) // 8
) or (
# Check both directions when the current model isn't a LoRA adapter
total_params > 0
and abs(label_params - total_params) > 7 * total_params // 8
):
# Likely a context length
name_types[i].add("finetune")
# Lowercase the size when it's a context length
part = part[:-1] + part[-1].lower()
except ValueError:
# Failed to convert the size label to float, use it anyway
pass
if len(name_types[i]) == 0:
name_types[i].add("size_label")
name_parts[i] = part
# Some easy to recognize finetune names
elif i > 0 and re.fullmatch(
r"chat|instruct|vision|lora", part, re.IGNORECASE
):
if total_params < 0 and part.lower() == "lora":
# ignore redundant "lora" in the finetune part when the output is a lora adapter
name_types[i].add("type")
else:
name_types[i].add("finetune")
# Ignore word-based size labels when there is at least a number-based one present
# TODO: should word-based size labels always be removed instead?
if any(
c.isdecimal()
for n, t in zip(name_parts, name_types)
if "size_label" in t
for c in n
):
for n, t in zip(name_parts, name_types):
if "size_label" in t:
if all(c.isalpha() for c in n):
t.remove("size_label")
at_start = True
# Find the basename through the annotated name
for part, t in zip(name_parts, name_types):
if at_start and ((len(t) == 0 and part[0].isalpha()) or "version" in t):
t.add("basename")
else:
if at_start:
at_start = False
if len(t) == 0:
t.add("finetune")
# Remove the basename annotation from trailing version
for part, t in zip(reversed(name_parts), reversed(name_types)):
if "basename" in t and len(t) > 1:
t.remove("basename")
else:
break
basename = (
"-".join(n for n, t in zip(name_parts, name_types) if "basename" in t)
or None
)
# Deduplicate size labels using an order-preserving 'dict' ('set' does not preserve insertion order)
size_label = (
"-".join(
dict.fromkeys(
s for s, t in zip(name_parts, name_types) if "size_label" in t
).keys()
)
or None
)
finetune = (
"-".join(f for f, t in zip(name_parts, name_types) if "finetune" in t)
or None
)
# TODO: should the basename version always be excluded?
# NOTE: multiple finetune versions are joined together
version = (
"-".join(
v
for v, t, in zip(name_parts, name_types)
if "version" in t and "basename" not in t
)
or None
)
if size_label is None and finetune is None and version is None:
# Too ambiguous, output nothing
basename = None
return (
model_full_name_component,
org_component,
basename,
finetune,
version,
size_label,
)
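# Illustrative usage sketch (not part of the original file; the model id below is only
# an example of the Hugging Face '<org>/<name>' convention):
#   Metadata.get_model_id_components("mistralai/Mixtral-8x7B-Instruct-v0.1")
#   # -> ('Mixtral-8x7B-Instruct-v0.1', 'mistralai', 'Mixtral', 'Instruct', 'v0.1', '8x7B')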
@staticmethod
def apply_metadata_heuristic(
metadata: Metadata,
model_card: Optional[dict] = None,
hf_params: Optional[dict] = None,
model_path: Optional[Path] = None,
total_params: int = 0,
) -> Metadata:
# Reference Model Card Metadata: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
# Model Card Heuristics
########################
if model_card is not None:
def use_model_card_metadata(metadata_key: str, model_card_key: str):
if (
model_card_key in model_card
and getattr(metadata, metadata_key, None) is None
):
setattr(metadata, metadata_key, model_card.get(model_card_key))
def use_array_model_card_metadata(metadata_key: str, model_card_key: str):
# Note: will append rather than replace if the value already exists
tags_value = model_card.get(model_card_key, None)
if tags_value is None:
return
current_value = getattr(metadata, metadata_key, None)
if current_value is None:
current_value = []
if isinstance(tags_value, str):
current_value.append(tags_value)
elif isinstance(tags_value, list):
current_value.extend(tags_value)
setattr(metadata, metadata_key, current_value)
# LLAMA.cpp's direct internal convention
# (Definitely not part of hugging face formal/informal standard)
#########################################
use_model_card_metadata("name", "name")
use_model_card_metadata("author", "author")
use_model_card_metadata("version", "version")
use_model_card_metadata("organization", "organization")
use_model_card_metadata("description", "description")
use_model_card_metadata("finetune", "finetune")
use_model_card_metadata("basename", "basename")
use_model_card_metadata("size_label", "size_label")
use_model_card_metadata("source_url", "url")
use_model_card_metadata("source_doi", "doi")
use_model_card_metadata("source_uuid", "uuid")
use_model_card_metadata("source_repo_url", "repo_url")
# LLAMA.cpp's huggingface style convention
# (Definitely not part of hugging face formal/informal standard... but with model_ appended to match their style)
###########################################
use_model_card_metadata("name", "model_name")
use_model_card_metadata("author", "model_author")
use_model_card_metadata("version", "model_version")
use_model_card_metadata("organization", "model_organization")
use_model_card_metadata("description", "model_description")
use_model_card_metadata("finetune", "model_finetune")
use_model_card_metadata("basename", "model_basename")
use_model_card_metadata("size_label", "model_size_label")
use_model_card_metadata("source_url", "model_url")
use_model_card_metadata("source_doi", "model_doi")
use_model_card_metadata("source_uuid", "model_uuid")
use_model_card_metadata("source_repo_url", "model_repo_url")
# Hugging Face Direct Convention
#################################
# Not part of the Hugging Face model card standard, but some model creators use it,
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
use_model_card_metadata("name", "model_name")
use_model_card_metadata("author", "model_creator")
use_model_card_metadata("basename", "model_type")
if (
"base_model" in model_card
or "base_models" in model_card
or "base_model_sources" in model_card
):
# This represents the parent models that this is based on
# Example: stabilityai/stable-diffusion-xl-base-1.0. Can also be a list (for merges)
# Example of merges: https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0.1/blob/main/README.md
metadata_base_models = []
base_model_value = model_card.get(
"base_model",
model_card.get(
"base_models", model_card.get("base_model_sources", None)
),
)
if base_model_value is not None:
if isinstance(base_model_value, str):
metadata_base_models.append(base_model_value)
elif isinstance(base_model_value, list):
metadata_base_models.extend(base_model_value)
if metadata.base_models is None:
metadata.base_models = []
for model_id in metadata_base_models:
# NOTE: model size of base model is assumed to be similar to the size of the current model
base_model = {}
if isinstance(model_id, str):
if (
model_id.startswith("http://")
or model_id.startswith("https://")
or model_id.startswith("ssh://")
):
base_model["repo_url"] = model_id
# Check if Hugging Face ID is present in URL
if "huggingface.co" in model_id:
match = re.match(
r"https?://huggingface.co/([^/]+/[^/]+)$", model_id
)
if match:
model_id_component = match.group(1)
(
model_full_name_component,
org_component,
basename,
finetune,
version,
size_label,
) = Metadata.get_model_id_components(
model_id_component, total_params
)
# Populate model dictionary with extracted components
if model_full_name_component is not None:
base_model["name"] = Metadata.id_to_title(
model_full_name_component
)
if org_component is not None:
base_model["organization"] = (
Metadata.id_to_title(org_component)
)
if version is not None:
base_model["version"] = version
else:
# Likely a Hugging Face ID
(
model_full_name_component,
org_component,
basename,
finetune,
version,
size_label,
) = Metadata.get_model_id_components(model_id, total_params)
# Populate model dictionary with extracted components
if model_full_name_component is not None:
base_model["name"] = Metadata.id_to_title(
model_full_name_component
)
if org_component is not None:
base_model["organization"] = Metadata.id_to_title(
org_component
)
if version is not None:
base_model["version"] = version
if (
org_component is not None
and model_full_name_component is not None
):
base_model["repo_url"] = (
f"https://huggingface.co/{org_component}/{model_full_name_component}"
)
elif isinstance(model_id, dict):
base_model = model_id
else:
logger.error(
f"base model entry '{str(model_id)}' not in a known format"
)
metadata.base_models.append(base_model)
if (
"datasets" in model_card
or "dataset" in model_card
or "dataset_sources" in model_card
):
# This represents the datasets that this was trained from
metadata_datasets = []
dataset_value = model_card.get(
"datasets",
model_card.get("dataset", model_card.get("dataset_sources", None)),
)
if dataset_value is not None:
if isinstance(dataset_value, str):
metadata_datasets.append(dataset_value)
elif isinstance(dataset_value, list):
metadata_datasets.extend(dataset_value)
if metadata.datasets is None:
metadata.datasets = []
for dataset_id in metadata_datasets:
# NOTE: dataset IDs are parsed with the same heuristics as model IDs
dataset = {}
if isinstance(dataset_id, str):
if dataset_id.startswith(("http://", "https://", "ssh://")):
dataset["repo_url"] = dataset_id
# Check if Hugging Face ID is present in URL
if "huggingface.co" in dataset_id:
match = re.match(
r"https?://huggingface.co/([^/]+/[^/]+)$",
dataset_id,
)
if match:
dataset_id_component = match.group(1)
(
dataset_name_component,
org_component,
basename,
finetune,
version,
size_label,
) = Metadata.get_model_id_components(
dataset_id_component, total_params
)
# Populate dataset dictionary with extracted components
if dataset_name_component is not None:
dataset["name"] = Metadata.id_to_title(
dataset_name_component
)
if org_component is not None:
dataset["organization"] = Metadata.id_to_title(
org_component
)
if version is not None:
dataset["version"] = version
else:
# Likely a Hugging Face ID
(
dataset_name_component,
org_component,
basename,
finetune,
version,
size_label,
) = Metadata.get_model_id_components(
dataset_id, total_params
)
# Populate dataset dictionary with extracted components
if dataset_name_component is not None:
dataset["name"] = Metadata.id_to_title(
dataset_name_component
)
if org_component is not None:
dataset["organization"] = Metadata.id_to_title(
org_component
)
if version is not None:
dataset["version"] = version
if (
org_component is not None
and dataset_name_component is not None
):
dataset["repo_url"] = (
f"https://huggingface.co/{org_component}/{dataset_name_component}"
)
elif isinstance(dataset_id, dict):
dataset = dataset_id
else:
logger.error(
f"dataset entry '{str(dataset_id)}' not in a known format"
)
metadata.datasets.append(dataset)
use_model_card_metadata("license", "license")
use_model_card_metadata("license_name", "license_name")
use_model_card_metadata("license_link", "license_link")
use_array_model_card_metadata("tags", "tags")
use_array_model_card_metadata("tags", "pipeline_tag")
use_array_model_card_metadata("languages", "languages")
use_array_model_card_metadata("languages", "language")
# Hugging Face Parameter Heuristics
####################################
if hf_params is not None:
hf_name_or_path = hf_params.get("_name_or_path")
if hf_name_or_path is not None and hf_name_or_path.count("/") <= 1:
# Use _name_or_path only if it's actually a model name and not a filesystem path
# e.g. 'meta-llama/Llama-2-7b-hf'
model_id = hf_name_or_path
(
model_full_name_component,
org_component,
basename,
finetune,
version,
size_label,
) = Metadata.get_model_id_components(model_id, total_params)
if metadata.name is None and model_full_name_component is not None:
metadata.name = Metadata.id_to_title(model_full_name_component)
if metadata.organization is None and org_component is not None:
metadata.organization = Metadata.id_to_title(org_component)
if metadata.basename is None and basename is not None:
metadata.basename = basename
if metadata.finetune is None and finetune is not None:
metadata.finetune = finetune
if metadata.version is None and version is not None:
metadata.version = version
if metadata.size_label is None and size_label is not None:
metadata.size_label = size_label
# Directory Folder Name Fallback Heuristics
############################################
if model_path is not None:
model_id = model_path.name
(
model_full_name_component,
org_component,
basename,
finetune,
version,
size_label,
) = Metadata.get_model_id_components(model_id, total_params)
if metadata.name is None and model_full_name_component is not None:
metadata.name = Metadata.id_to_title(model_full_name_component)
if metadata.organization is None and org_component is not None:
metadata.organization = Metadata.id_to_title(org_component)
if metadata.basename is None and basename is not None:
metadata.basename = basename
if metadata.finetune is None and finetune is not None:
metadata.finetune = finetune
if metadata.version is None and version is not None:
metadata.version = version
if metadata.size_label is None and size_label is not None:
metadata.size_label = size_label
return metadata
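# Illustrative note (not part of the original file): because every heuristic above only
# fills fields that are still None, the effective precedence is
#   metadata override file > model card > config.json "_name_or_path" > directory name,
# with the override file applied separately in Metadata.load() above.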
def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter):
assert self.name is not None
gguf_writer.add_name(self.name)
if self.author is not None:
gguf_writer.add_author(self.author)
if self.version is not None:
gguf_writer.add_version(self.version)
if self.organization is not None:
gguf_writer.add_organization(self.organization)
if self.finetune is not None:
gguf_writer.add_finetune(self.finetune)
if self.basename is not None:
gguf_writer.add_basename(self.basename)
if self.description is not None:
gguf_writer.add_description(self.description)
if self.quantized_by is not None:
gguf_writer.add_quantized_by(self.quantized_by)
if self.size_label is not None:
gguf_writer.add_size_label(self.size_label)
if self.license is not None:
if isinstance(self.license, list):
gguf_writer.add_license(",".join(self.license))
else:
gguf_writer.add_license(self.license)
if self.license_name is not None:
gguf_writer.add_license_name(self.license_name)
if self.license_link is not None:
gguf_writer.add_license_link(self.license_link)
if self.url is not None:
gguf_writer.add_url(self.url)
if self.doi is not None:
gguf_writer.add_doi(self.doi)
if self.uuid is not None:
gguf_writer.add_uuid(self.uuid)
if self.repo_url is not None:
gguf_writer.add_repo_url(self.repo_url)
if self.source_url is not None:
gguf_writer.add_source_url(self.source_url)
if self.source_doi is not None:
gguf_writer.add_source_doi(self.source_doi)
if self.source_uuid is not None:
gguf_writer.add_source_uuid(self.source_uuid)
if self.source_repo_url is not None:
gguf_writer.add_source_repo_url(self.source_repo_url)
if self.base_models is not None:
gguf_writer.add_base_model_count(len(self.base_models))
for key, base_model_entry in enumerate(self.base_models):
if "name" in base_model_entry:
gguf_writer.add_base_model_name(key, base_model_entry["name"])
if "author" in base_model_entry:
gguf_writer.add_base_model_author(key, base_model_entry["author"])
if "version" in base_model_entry:
gguf_writer.add_base_model_version(key, base_model_entry["version"])
if "organization" in base_model_entry:
gguf_writer.add_base_model_organization(
key, base_model_entry["organization"]
)
if "description" in base_model_entry:
gguf_writer.add_base_model_description(
key, base_model_entry["description"]
)
if "url" in base_model_entry:
gguf_writer.add_base_model_url(key, base_model_entry["url"])
if "doi" in base_model_entry:
gguf_writer.add_base_model_doi(key, base_model_entry["doi"])
if "uuid" in base_model_entry:
gguf_writer.add_base_model_uuid(key, base_model_entry["uuid"])
if "repo_url" in base_model_entry:
gguf_writer.add_base_model_repo_url(
key, base_model_entry["repo_url"]
)
if self.datasets is not None:
gguf_writer.add_dataset_count(len(self.datasets))
for key, dataset_entry in enumerate(self.datasets):
if "name" in dataset_entry:
gguf_writer.add_dataset_name(key, dataset_entry["name"])
if "author" in dataset_entry:
gguf_writer.add_dataset_author(key, dataset_entry["author"])
if "version" in dataset_entry:
gguf_writer.add_dataset_version(key, dataset_entry["version"])
if "organization" in dataset_entry:
gguf_writer.add_dataset_organization(
key, dataset_entry["organization"]
)
if "description" in dataset_entry:
gguf_writer.add_dataset_description(
key, dataset_entry["description"]
)
if "url" in dataset_entry:
gguf_writer.add_dataset_url(key, dataset_entry["url"])
if "doi" in dataset_entry:
gguf_writer.add_dataset_doi(key, dataset_entry["doi"])
if "uuid" in dataset_entry:
gguf_writer.add_dataset_uuid(key, dataset_entry["uuid"])
if "repo_url" in dataset_entry:
gguf_writer.add_dataset_repo_url(key, dataset_entry["repo_url"])
if self.tags is not None:
gguf_writer.add_tags(self.tags)
if self.languages is not None:
gguf_writer.add_languages(self.languages)
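# Illustrative usage sketch (not part of the original file; the writer arguments are
# assumptions about the gguf.GGUFWriter API used elsewhere in this package):
#   writer = gguf.GGUFWriter("model-output.gguf", arch="llama")
#   meta = Metadata.load(model_path=Path("./some-model"), total_params=n_params)
#   meta.set_gguf_meta_model(writer)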

1452 src/gguf/quants.py (new file; diff suppressed because it is too large)

884 src/gguf/tensor_mapping.py (new file)

@@ -0,0 +1,884 @@
from __future__ import annotations
from typing import Sequence
from .constants import MODEL_ARCH, MODEL_TENSOR, MODEL_TENSORS, TENSOR_NAMES
class TensorNameMap:
mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
# Token embeddings
MODEL_TENSOR.TOKEN_EMBD: (
"gpt_neox.embed_in", # gptneox
"transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone
"transformer.word_embeddings", # falcon
"word_embeddings", # bloom
"model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414
"tok_embeddings", # llama-pth
"embeddings.word_embeddings", # bert nomic-bert
"language_model.embedding.word_embeddings", # persimmon
"wte", # gpt2
"transformer.embd.wte", # phi2
"model.tok_embeddings", # internlm2
"model.embedding", # mamba-qbert
"backbone.embedding", # mamba
"backbone.embeddings", # mamba-hf
"transformer.in_out_embed", # Grok
"embedding.word_embeddings", # chatglm
"transformer.token_embeddings", # openelm
"shared", # t5
"rwkv.embeddings", # rwkv6
"model.embeddings", # rwkv7
"model.word_embeddings", # bailingmoe
"language_model.model.embed_tokens", # llama4
),
# Token type embeddings
MODEL_TENSOR.TOKEN_TYPES: (
"embeddings.token_type_embeddings", # bert nomic-bert
),
# Normalization of token embeddings
MODEL_TENSOR.TOKEN_EMBD_NORM: (
"word_embeddings_layernorm", # bloom
"embeddings.LayerNorm", # bert
"emb_ln", # nomic-bert
"transformer.norm", # openelm
"rwkv.blocks.0.pre_ln", # rwkv
"rwkv.blocks.0.pre_ln", # rwkv6
"model.pre_ln", # rwkv7
"model.layers.0.pre_norm", # rwkv7
"backbone.norm", # wavtokenizer
),
# Position embeddings
MODEL_TENSOR.POS_EMBD: (
"transformer.wpe", # gpt2
"embeddings.position_embeddings", # bert
"wpe", # gpt2
),
# Output
MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe
"output", # llama-pth bloom internlm2
"word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2
"output_layer", # chatglm
"head", # rwkv
"head.out", # wavtokenizer
"lm_head", # llama4
),
# Output norm
MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox
"transformer.ln_f", # gpt2 gpt-j falcon jais exaone
"model.norm", # llama-hf baichuan internlm2 olmoe olmo2 phimoe
"norm", # llama-pth
"transformer.norm_f", # mpt dbrx
"ln_f", # refact bloom qwen gpt2
"language_model.encoder.final_layernorm", # persimmon
"model.final_layernorm", # persimmon
"lm_head.ln", # phi2
"model.norm_f", # mamba-qbert
"backbone.norm_f", # mamba
"transformer.rms_norm", # Grok
"encoder.final_layernorm", # chatglm
"transformer.norm", # openelm
"model.norm", # nemotron
"rwkv.ln_out", # rwkv6
"model.ln_out", # rwkv7
"backbone.final_layer_norm", # wavtokenizer
"model.norm", # llama4
),
# Rope frequencies
MODEL_TENSOR.ROPE_FREQS: (
"rope.freqs", # llama-pth
"rotary_pos_emb.inv_freq", # chatglm
),
MODEL_TENSOR.ROPE_FACTORS_LONG: (),
MODEL_TENSOR.ROPE_FACTORS_SHORT: (),
MODEL_TENSOR.CONV1D: ("backbone.embed",), # roberta
}
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
# Attention norm
MODEL_TENSOR.ATTN_NORM: (
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen jais exaone
"transformer.blocks.{bid}.norm_1", # mpt
"transformer.h.{bid}.input_layernorm", # falcon7b
"h.{bid}.input_layernorm", # bloom
"transformer.h.{bid}.ln_mlp", # falcon40b
"model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe phimoe
"layers.{bid}.attention_norm", # llama-pth
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon
"model.layers.{bid}.ln1", # yi
"h.{bid}.ln_1", # gpt2
"transformer.h.{bid}.ln", # phi2
"model.layers.layers.{bid}.norm", # plamo
"model.layers.{bid}.attention_norm", # internlm2
"model.layers.{bid}.norm", # mamba-qbert
"backbone.layers.{bid}.norm", # mamba
"transformer.decoder_layer.{bid}.rms_norm", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
"encoder.layers.{bid}.input_layernorm", # chatglm
"transformer.layers.{bid}.attn_norm", # openelm
"rwkv.blocks.{bid}.ln1", # rwkv6
"model.layers.{bid}.ln1", # rwkv7
"model.layers.{bid}.input_layernorm", # llama4
),
# Attention norm 2
MODEL_TENSOR.ATTN_NORM_2: (
"transformer.h.{bid}.ln_attn", # falcon40b
"encoder.layer.{bid}.layer_norm_1", # jina-v2-code
"rwkv.blocks.{bid}.ln2", # rwkv6
"model.layers.{bid}.ln2", # rwkv7
),
# Attention query-key-value
MODEL_TENSOR.ATTN_QKV: (
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
"transformer.h.{bid}.attn.c_attn", # gpt2 qwen jais
"transformer.blocks.{bid}.attn.Wqkv", # mpt
"transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv", # dbrx
"transformer.h.{bid}.self_attention.query_key_value", # falcon
"h.{bid}.self_attention.query_key_value", # bloom
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
"model.layers.{bid}.self_attn.query_key_value", # persimmon
"h.{bid}.attn.c_attn", # gpt2
"transformer.h.{bid}.mixer.Wqkv", # phi2
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert
"model.layers.{bid}.self_attn.qkv_proj", # phi3
"encoder.layers.{bid}.self_attention.query_key_value", # chatglm
"transformer.layers.{bid}.attn.qkv_proj", # openelm
),
# Attention query
MODEL_TENSOR.ATTN_Q: (
"model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2 phimoe
"model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom
"layers.{bid}.attention.wq", # llama-pth
"encoder.layer.{bid}.attention.self.query", # bert
"transformer.h.{bid}.attn.q_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.q_proj", # plamo
"model.layers.{bid}.attention.wq", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.query", # Grok
"transformer.h.{bid}.attn.attention.q_proj", # exaone
"model.layers.{bid}.self_attn.q_proj", # llama4
),
# Attention key
MODEL_TENSOR.ATTN_K: (
"model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2 phimoe
"model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom
"layers.{bid}.attention.wk", # llama-pth
"encoder.layer.{bid}.attention.self.key", # bert
"transformer.h.{bid}.attn.k_proj", # gpt-j
"transformer.h.{bid}.attn.k", # refact
"model.layers.layers.{bid}.self_attn.k_proj", # plamo
"model.layers.{bid}.attention.wk", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok
"transformer.h.{bid}.attn.attention.k_proj", # exaone
"model.layers.{bid}.self_attn.k_proj", # llama4
),
# Attention value
MODEL_TENSOR.ATTN_V: (
"model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2 phimoe
"layers.{bid}.attention.wv", # llama-pth
"encoder.layer.{bid}.attention.self.value", # bert
"transformer.h.{bid}.attn.v_proj", # gpt-j
"transformer.h.{bid}.attn.v", # refact
"model.layers.layers.{bid}.self_attn.v_proj", # plamo
"model.layers.{bid}.attention.wv", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.value", # Grok
"transformer.h.{bid}.attn.attention.v_proj", # exaone
"model.layers.{bid}.self_attn.v_proj", # llama4
),
# Attention output
MODEL_TENSOR.ATTN_OUT: (
"gpt_neox.layers.{bid}.attention.dense", # gptneox
"transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen jais
"transformer.blocks.{bid}.attn.out_proj", # mpt
"transformer.h.{bid}.self_attention.dense", # falcon
"h.{bid}.self_attention.dense", # bloom
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2 phimoe
"model.layers.{bid}.self_attn.linear_attn", # deci
"layers.{bid}.attention.wo", # llama-pth
"encoder.layer.{bid}.attention.output.dense", # bert
"transformer.h.{bid}.attn.out_proj", # gpt-j
"language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
"model.layers.{bid}.self_attn.dense", # persimmon
"h.{bid}.attn.c_proj", # gpt2
"transformer.h.{bid}.mixer.out_proj", # phi2
"model.layers.layers.{bid}.self_attn.o_proj", # plamo
"model.layers.{bid}.attention.wo", # internlm2
"encoder.layers.{bid}.attn.out_proj", # nomic-bert
"transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
"transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
"encoder.layers.{bid}.self_attention.dense", # chatglm
"transformer.layers.{bid}.attn.out_proj", # openelm
"transformer.h.{bid}.attn.attention.out_proj", # exaone
"model.layers.{bid}.self_attn.o_proj", # llama4
),
# Attention output norm
MODEL_TENSOR.ATTN_OUT_NORM: (
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
"encoder.layers.{bid}.norm1", # nomic-bert
"transformer.decoder_layer.{bid}.rms_norm_1", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
),
MODEL_TENSOR.ATTN_POST_NORM: (
"model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 # ge
"model.layers.{bid}.post_self_attn_layernorm", # glm-4-0414
),
# Rotary embeddings
MODEL_TENSOR.ATTN_ROT_EMBD: (
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
"layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth
"model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo
"transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell
),
# Feed-forward norm
MODEL_TENSOR.FFN_NORM: (
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
"transformer.h.{bid}.ln_2", # gpt2 refact qwen jais exaone
"h.{bid}.post_attention_layernorm", # bloom
"transformer.blocks.{bid}.norm_2", # mpt
"model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron olmoe phimoe
"layers.{bid}.ffn_norm", # llama-pth
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
"model.layers.{bid}.ln2", # yi
"h.{bid}.ln_2", # gpt2
"model.layers.{bid}.ffn_norm", # internlm2
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok
"encoder.layers.{bid}.post_attention_layernorm", # chatglm
"transformer.layers.{bid}.ffn_norm", # openelm
"model.layers.{bid}.post_attention_layernorm", # llama4
),
# Pre feed-forward norm
MODEL_TENSOR.FFN_PRE_NORM: (
"model.layers.{bid}.pre_feedforward_layernorm", # gemma2
),
# Post feed-forward norm
MODEL_TENSOR.FFN_POST_NORM: (
"model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
"model.layers.{bid}.post_mlp_layernorm", # glm-4-0414
),
MODEL_TENSOR.FFN_GATE_INP: (
"layers.{bid}.feed_forward.gate", # mixtral
"model.layers.{bid}.block_sparse_moe.gate", # mixtral phimoe
"model.layers.{bid}.mlp.gate", # qwen2moe olmoe
"transformer.decoder_layer.{bid}.router", # Grok
"transformer.blocks.{bid}.ffn.router.layer", # dbrx
"model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
"model.layers.{bid}.feed_forward.router", # llama4
"encoder.layers.{bid}.mlp.router.layer", # nomic-bert-moe
),
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
"model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe
),
MODEL_TENSOR.FFN_EXP_PROBS_B: (
"model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3
),
# Feed-forward up
MODEL_TENSOR.FFN_UP: (
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
"transformer.h.{bid}.mlp.c_fc", # gpt2 jais
"transformer.blocks.{bid}.ffn.up_proj", # mpt
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
"h.{bid}.mlp.dense_h_to_4h", # bloom
"model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2
"layers.{bid}.feed_forward.w3", # llama-pth
"encoder.layer.{bid}.intermediate.dense", # bert
"transformer.h.{bid}.mlp.fc_in", # gpt-j
"transformer.h.{bid}.mlp.linear_3", # refact
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
"model.layers.{bid}.mlp.dense_h_to_4h", # persimmon
"transformer.h.{bid}.mlp.w1", # qwen
"h.{bid}.mlp.c_fc", # gpt2
"transformer.h.{bid}.mlp.fc1", # phi2
"model.layers.{bid}.mlp.fc1", # phi2
"model.layers.{bid}.mlp.gate_up_proj", # phi3 glm-4-0414
"model.layers.layers.{bid}.mlp.up_proj", # plamo
"model.layers.{bid}.feed_forward.w3", # internlm2
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
"encoder.layers.{bid}.mlp.fc1", # nomic-bert-moe
"model.layers.{bid}.mlp.c_fc", # starcoder2
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
"model.layers.{bid}.residual_mlp.w3", # arctic
"encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
"transformer.h.{bid}.mlp.c_fc_1", # exaone
"model.layers.{bid}.feed_forward.up_proj", # llama4
),
MODEL_TENSOR.FFN_UP_EXP: (
"layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
"model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged)
"model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
"model.layers.{bid}.feed_forward.experts.up_proj", # llama4
"encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe
),
MODEL_TENSOR.FFN_UP_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
"model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2
"model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4
),
# AWQ-activation gate
MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",), # mpt
# Feed-forward gate
MODEL_TENSOR.FFN_GATE: (
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2
"layers.{bid}.feed_forward.w1", # llama-pth
"transformer.h.{bid}.mlp.w2", # qwen
"transformer.h.{bid}.mlp.c_fc2", # jais
"model.layers.layers.{bid}.mlp.gate_proj", # plamo
"model.layers.{bid}.feed_forward.w1", # internlm2
"encoder.layers.{bid}.mlp.fc12", # nomic-bert
"encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2
"transformer.h.{bid}.mlp.linear_1", # refact
"model.layers.{bid}.residual_mlp.w1", # arctic
"transformer.h.{bid}.mlp.c_fc_0", # exaone
"model.layers.{bid}.feed_forward.gate_proj", # llama4
),
MODEL_TENSOR.FFN_GATE_EXP: (
"layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
"model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged)
"model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged)
"model.layers.{bid}.feed_forward.experts.gate_proj", # llama4
),
MODEL_TENSOR.FFN_GATE_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
"model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
"model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4
),
# Feed-forward down
MODEL_TENSOR.FFN_DOWN: (
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
"transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen jais
"transformer.blocks.{bid}.ffn.down_proj", # mpt
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
"h.{bid}.mlp.dense_4h_to_h", # bloom
"model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2
"layers.{bid}.feed_forward.w2", # llama-pth
"encoder.layer.{bid}.output.dense", # bert
"transformer.h.{bid}.mlp.fc_out", # gpt-j
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
"model.layers.{bid}.mlp.dense_4h_to_h", # persimmon
"h.{bid}.mlp.c_proj", # gpt2
"transformer.h.{bid}.mlp.fc2", # phi2
"model.layers.{bid}.mlp.fc2", # phi2
"model.layers.layers.{bid}.mlp.down_proj", # plamo
"model.layers.{bid}.feed_forward.w2", # internlm2
"encoder.layers.{bid}.mlp.fc2", # nomic-bert
"model.layers.{bid}.mlp.c_proj", # starcoder2
"encoder.layer.{bid}.mlp.wo", # jina-bert-v2
"transformer.layers.{bid}.ffn.proj_2", # openelm
"model.layers.{bid}.residual_mlp.w2", # arctic
"encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2
"encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
"model.layers.h.{bid}.mlp.c_proj", # exaone
"model.layers.{bid}.feed_forward.down_proj", # llama4
),
MODEL_TENSOR.FFN_DOWN_EXP: (
"layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
"model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged)
"model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
"model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
"model.layers.{bid}.feed_forward.experts.down_proj", # llama4
"encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe
),
MODEL_TENSOR.FFN_DOWN_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
"model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2
"model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4
"model.layers.{bid}.shared_mlp.output_linear", # granitemoe
),
MODEL_TENSOR.ATTN_Q_NORM: (
"language_model.encoder.layers.{bid}.self_attention.q_layernorm",
"model.layers.{bid}.self_attn.q_layernorm", # persimmon
"model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo2
"transformer.blocks.{bid}.attn.q_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
"transformer.layers.{bid}.attn.q_norm", # openelm
),
MODEL_TENSOR.ATTN_K_NORM: (
"language_model.encoder.layers.{bid}.self_attention.k_layernorm",
"model.layers.{bid}.self_attn.k_layernorm", # persimmon
"model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo2
"transformer.blocks.{bid}.attn.k_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
"transformer.layers.{bid}.attn.k_norm", # openelm
),
MODEL_TENSOR.ROPE_FREQS: (
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
),
MODEL_TENSOR.LAYER_OUT_NORM: (
"encoder.layer.{bid}.output.LayerNorm", # bert
"encoder.layers.{bid}.norm2", # nomic-bert
"transformer.decoder_layer.{bid}.rms_norm_3", # Grok
"encoder.layer.{bid}.mlp.layernorm", # jina-bert-v2
"encoder.layer.{bid}.layer_norm_2", # jina-v2-code
),
MODEL_TENSOR.SSM_IN: (
"model.layers.{bid}.in_proj",
"backbone.layers.{bid}.mixer.in_proj",
),
MODEL_TENSOR.SSM_CONV1D: (
"model.layers.{bid}.conv1d",
"backbone.layers.{bid}.mixer.conv1d",
),
MODEL_TENSOR.SSM_X: (
"model.layers.{bid}.x_proj",
"backbone.layers.{bid}.mixer.x_proj",
),
MODEL_TENSOR.SSM_DT: (
"model.layers.{bid}.dt_proj",
"backbone.layers.{bid}.mixer.dt_proj",
),
MODEL_TENSOR.SSM_A: (
"model.layers.{bid}.A_log",
"backbone.layers.{bid}.mixer.A_log",
),
MODEL_TENSOR.SSM_D: (
"model.layers.{bid}.D",
"backbone.layers.{bid}.mixer.D",
),
MODEL_TENSOR.SSM_OUT: (
"model.layers.{bid}.out_proj",
"backbone.layers.{bid}.mixer.out_proj",
),
MODEL_TENSOR.TIME_MIX_W0: ("model.layers.{bid}.attention.w0",), # rwkv7
MODEL_TENSOR.TIME_MIX_W1: (
"rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv6
"model.layers.{bid}.self_attn.time_maa_w1", # rwkv6qwen2
"model.layers.{bid}.attention.w1", # rwkv7
),
MODEL_TENSOR.TIME_MIX_W2: (
"rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv6
"model.layers.{bid}.self_attn.time_maa_w2", # rwkv6qwen2
"model.layers.{bid}.attention.w2", # rwkv7
),
MODEL_TENSOR.TIME_MIX_A0: ("model.layers.{bid}.attention.a0",), # rwkv7
MODEL_TENSOR.TIME_MIX_A1: ("model.layers.{bid}.attention.a1",), # rwkv7
MODEL_TENSOR.TIME_MIX_A2: ("model.layers.{bid}.attention.a2",), # rwkv7
MODEL_TENSOR.TIME_MIX_V0: ("model.layers.{bid}.attention.v0",), # rwkv7
MODEL_TENSOR.TIME_MIX_V1: ("model.layers.{bid}.attention.v1",), # rwkv7
MODEL_TENSOR.TIME_MIX_V2: ("model.layers.{bid}.attention.v2",), # rwkv7
MODEL_TENSOR.TIME_MIX_G1: ("model.layers.{bid}.attention.g1",), # rwkv7
MODEL_TENSOR.TIME_MIX_G2: ("model.layers.{bid}.attention.g2",), # rwkv7
MODEL_TENSOR.TIME_MIX_K_K: ("model.layers.{bid}.attention.k_k",), # rwkv7
MODEL_TENSOR.TIME_MIX_K_A: ("model.layers.{bid}.attention.k_a",), # rwkv7
MODEL_TENSOR.TIME_MIX_R_K: ("model.layers.{bid}.attention.r_k",), # rwkv7
MODEL_TENSOR.TIME_MIX_LERP_X: (
"rwkv.blocks.{bid}.attention.time_maa_x", # rwkv6
"model.layers.{bid}.self_attn.time_maa_x", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_LERP_K: (
"rwkv.blocks.{bid}.attention.time_maa_k", # rwkv6
"model.layers.{bid}.self_attn.time_maa_k", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_LERP_V: (
"rwkv.blocks.{bid}.attention.time_maa_v", # rwkv6
"model.layers.{bid}.self_attn.time_maa_v", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_LERP_R: (
"rwkv.blocks.{bid}.attention.time_maa_r", # rwkv6
"model.layers.{bid}.self_attn.time_maa_r", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_LERP_G: (
"rwkv.blocks.{bid}.attention.time_maa_g", # rwkv6
"model.layers.{bid}.self_attn.time_maa_g", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_LERP_W: (
"rwkv.blocks.{bid}.attention.time_maa_w", # rwkv6
"model.layers.{bid}.self_attn.time_maa_w", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_FIRST: (
"rwkv.blocks.{bid}.attention.time_faaaa", # rwkv6
),
MODEL_TENSOR.TIME_MIX_DECAY: (
"rwkv.blocks.{bid}.attention.time_decay", # rwkv6
"model.layers.{bid}.self_attn.time_decay", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_DECAY_W1: (
"rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv6
"model.layers.{bid}.self_attn.time_decay_w1", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_DECAY_W2: (
"rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv6
"model.layers.{bid}.self_attn.time_decay_w2", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_KEY: (
"rwkv.blocks.{bid}.attention.key", # rwkv6
"model.layers.{bid}.self_attn.k_proj", # rwkv6qwen2
"model.layers.{bid}.attention.key", # rwkv7
"model.layers.{bid}.attention.k_proj", # rwkv7
),
MODEL_TENSOR.TIME_MIX_VALUE: (
"rwkv.blocks.{bid}.attention.value", # rwkv6
"model.layers.{bid}.self_attn.v_proj", # rwkv6qwen2
"model.layers.{bid}.attention.value", # rwkv7
"model.layers.{bid}.attention.v_proj", # rwkv7
),
MODEL_TENSOR.TIME_MIX_RECEPTANCE: (
"rwkv.blocks.{bid}.attention.receptance", # rwkv6
"model.layers.{bid}.self_attn.q_proj", # rwkv6qwen2
"model.layers.{bid}.attention.receptance", # rwkv7
"model.layers.{bid}.attention.r_proj", # rwkv7
),
MODEL_TENSOR.TIME_MIX_GATE: (
"rwkv.blocks.{bid}.attention.gate", # rwkv6
"model.layers.{bid}.self_attn.gate", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_LN: (
"rwkv.blocks.{bid}.attention.ln_x", # rwkv6
"model.layers.{bid}.attention.ln_x", # rwkv7
),
MODEL_TENSOR.TIME_MIX_OUTPUT: (
"rwkv.blocks.{bid}.attention.output", # rwkv6
"model.layers.{bid}.self_attn.o_proj", # rwkv6qwen2
"model.layers.{bid}.attention.output", # rwkv7
"model.layers.{bid}.attention.o_proj", # rwkv7
),
MODEL_TENSOR.CHANNEL_MIX_LERP_K: (
"rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv6
"model.layers.{bid}.feed_forward.x_k", # rwkv7
),
MODEL_TENSOR.CHANNEL_MIX_LERP_R: (
"rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv6
),
MODEL_TENSOR.CHANNEL_MIX_KEY: (
"rwkv.blocks.{bid}.feed_forward.key", # rwkv6
"model.layers.{bid}.feed_forward.key", # rwkv7
),
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
"rwkv.blocks.{bid}.feed_forward.receptance", # rwkv6
),
MODEL_TENSOR.CHANNEL_MIX_VALUE: (
"rwkv.blocks.{bid}.feed_forward.value", # rwkv6
"model.layers.{bid}.feed_forward.value", # rwkv7
),
MODEL_TENSOR.ATTN_Q_A: ("model.layers.{bid}.self_attn.q_a_proj",), # deepseek2
MODEL_TENSOR.ATTN_Q_B: ("model.layers.{bid}.self_attn.q_b_proj",), # deepseek2
MODEL_TENSOR.ATTN_KV_A_MQA: (
"model.layers.{bid}.self_attn.kv_a_proj_with_mqa", # deepseek2
),
MODEL_TENSOR.ATTN_KV_B: (
"model.layers.{bid}.self_attn.kv_b_proj", # deepseek2
),
MODEL_TENSOR.ATTN_K_B: ("model.layers.{bid}.self_attn.k_b_proj",), # deepseek2
MODEL_TENSOR.ATTN_V_B: ("model.layers.{bid}.self_attn.v_b_proj",), # deepseek2
MODEL_TENSOR.ATTN_Q_A_NORM: (
"model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2
),
MODEL_TENSOR.ATTN_KV_A_NORM: (
"model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2
),
MODEL_TENSOR.ATTN_SUB_NORM: (
"model.layers.{bid}.self_attn.inner_attn_ln", # bitnet
),
MODEL_TENSOR.FFN_SUB_NORM: ("model.layers.{bid}.mlp.ffn_layernorm",), # bitnet
MODEL_TENSOR.DEC_ATTN_NORM: ("decoder.block.{bid}.layer.0.layer_norm",), # t5
MODEL_TENSOR.DEC_ATTN_Q: ("decoder.block.{bid}.layer.0.SelfAttention.q",), # t5
MODEL_TENSOR.DEC_ATTN_K: ("decoder.block.{bid}.layer.0.SelfAttention.k",), # t5
MODEL_TENSOR.DEC_ATTN_V: ("decoder.block.{bid}.layer.0.SelfAttention.v",), # t5
MODEL_TENSOR.DEC_ATTN_OUT: (
"decoder.block.{bid}.layer.0.SelfAttention.o", # t5
),
MODEL_TENSOR.DEC_ATTN_REL_B: (
"decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_NORM: (
"decoder.block.{bid}.layer.1.layer_norm", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_Q: (
"decoder.block.{bid}.layer.1.EncDecAttention.q", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_K: (
"decoder.block.{bid}.layer.1.EncDecAttention.k", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_V: (
"decoder.block.{bid}.layer.1.EncDecAttention.v", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_OUT: (
"decoder.block.{bid}.layer.1.EncDecAttention.o", # t5
),
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: (
"decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", # t5
),
MODEL_TENSOR.DEC_FFN_NORM: ("decoder.block.{bid}.layer.2.layer_norm",), # t5
MODEL_TENSOR.DEC_FFN_GATE: (
"decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5
),
MODEL_TENSOR.DEC_FFN_UP: (
"decoder.block.{bid}.layer.2.DenseReluDense.wi", # t5
"decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5
),
MODEL_TENSOR.DEC_FFN_DOWN: (
"decoder.block.{bid}.layer.2.DenseReluDense.wo", # t5
),
MODEL_TENSOR.DEC_OUTPUT_NORM: ("decoder.final_layer_norm",), # t5
MODEL_TENSOR.ENC_ATTN_NORM: ("encoder.block.{bid}.layer.0.layer_norm",), # t5
MODEL_TENSOR.ENC_ATTN_Q: ("encoder.block.{bid}.layer.0.SelfAttention.q",), # t5
MODEL_TENSOR.ENC_ATTN_K: ("encoder.block.{bid}.layer.0.SelfAttention.k",), # t5
MODEL_TENSOR.ENC_ATTN_V: ("encoder.block.{bid}.layer.0.SelfAttention.v",), # t5
MODEL_TENSOR.ENC_ATTN_OUT: (
"encoder.block.{bid}.layer.0.SelfAttention.o", # t5
),
MODEL_TENSOR.ENC_ATTN_REL_B: (
"encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
),
MODEL_TENSOR.ENC_FFN_NORM: ("encoder.block.{bid}.layer.1.layer_norm",), # t5
MODEL_TENSOR.ENC_FFN_GATE: (
"encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5
),
MODEL_TENSOR.ENC_FFN_UP: (
"encoder.block.{bid}.layer.1.DenseReluDense.wi", # t5
"encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5
),
MODEL_TENSOR.ENC_FFN_DOWN: (
"encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5
),
############################################################################
# TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg
MODEL_TENSOR.ENC_OUTPUT_NORM: ("encoder.final_layer_norm",), # t5
MODEL_TENSOR.CLS: (
"classifier", # jina
"classifier.dense", # roberta
),
MODEL_TENSOR.CLS_OUT: ("classifier.out_proj",), # roberta
#############################################################################
MODEL_TENSOR.CONVNEXT_DW: ("backbone.convnext.{bid}.dwconv",), # wavtokenizer
MODEL_TENSOR.CONVNEXT_NORM: ("backbone.convnext.{bid}.norm",), # wavtokenizer
MODEL_TENSOR.CONVNEXT_PW1: ("backbone.convnext.{bid}.pwconv1",), # wavtokenizer
MODEL_TENSOR.CONVNEXT_PW2: ("backbone.convnext.{bid}.pwconv2",), # wavtokenizer
MODEL_TENSOR.CONVNEXT_GAMMA: ("backbone.convnext.{bid}.gamma",), # wavtokenizer
MODEL_TENSOR.POSNET_CONV1: ("backbone.posnet.{bid}.conv1",), # wavtokenizer
MODEL_TENSOR.POSNET_CONV2: ("backbone.posnet.{bid}.conv2",), # wavtokenizer
MODEL_TENSOR.POSNET_NORM: ("backbone.posnet.{bid}.norm",), # wavtokenizer
MODEL_TENSOR.POSNET_NORM1: ("backbone.posnet.{bid}.norm1",), # wavtokenizer
MODEL_TENSOR.POSNET_NORM2: ("backbone.posnet.{bid}.norm2",), # wavtokenizer
MODEL_TENSOR.POSNET_ATTN_NORM: ("backbone.posnet.{bid}.norm",), # wavtokenizer
MODEL_TENSOR.POSNET_ATTN_Q: ("backbone.posnet.{bid}.q",), # wavtokenizer
MODEL_TENSOR.POSNET_ATTN_K: ("backbone.posnet.{bid}.k",), # wavtokenizer
MODEL_TENSOR.POSNET_ATTN_V: ("backbone.posnet.{bid}.v",), # wavtokenizer
MODEL_TENSOR.POSNET_ATTN_OUT: (
"backbone.posnet.{bid}.proj_out", # wavtokenizer
),
#############################################################################
## Vision encoder
MODEL_TENSOR.V_MMPROJ: (
"multi_modal_projector.linear_{bid}",
"visual.merger.mlp.{bid}", # qwen2vl
),
MODEL_TENSOR.V_MMPROJ_FC: (
"model.connector.modality_projection.proj", # SmolVLM
),
MODEL_TENSOR.V_MMPROJ_MLP: (
"model.mm_projector.mlp.mlp.{bid}",
"mlp1.{bid}", # InternVL
),
MODEL_TENSOR.V_MMPROJ_PEG: ("model.mm_projector.peg.peg.{bid}",),
MODEL_TENSOR.V_ENC_EMBD_CLS: (
"vision_tower.vision_model.embeddings.class_embedding",
),
MODEL_TENSOR.V_ENC_EMBD_PATCH: (
"vision_tower.vision_model.embeddings.patch_embedding",
"vpm.embeddings.patch_embedding",
"model.vision_model.embeddings.patch_embedding", # SmolVLM
"vision_tower.patch_conv", # pixtral
"visual.patch_embed.proj", # qwen2vl
),
MODEL_TENSOR.V_ENC_EMBD_POS: (
"vision_tower.vision_model.embeddings.position_embedding",
"vpm.embeddings.position_embedding",
"model.vision_model.embeddings.position_embedding", # SmolVLM
),
MODEL_TENSOR.V_ENC_ATTN_Q: (
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
"vpm.encoder.layers.{bid}.self_attn.q_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
"vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral
"visual.blocks.{bid}.attn.q", # qwen2vl, generated
),
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
"vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
),
MODEL_TENSOR.V_ENC_ATTN_K: (
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
"vpm.encoder.layers.{bid}.self_attn.k_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
"vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral
"visual.blocks.{bid}.attn.k", # qwen2vl, generated
),
MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
"vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
),
MODEL_TENSOR.V_ENC_ATTN_V: (
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
"vpm.encoder.layers.{bid}.self_attn.v_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
"vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral
"visual.blocks.{bid}.attn.v", # qwen2vl, generated
),
MODEL_TENSOR.V_ENC_INPUT_NORM: (
"vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
"vision_tower.vision_model.encoder.layers.{bid}.norm1", # InternVL
"vpm.encoder.layers.{bid}.layer_norm1",
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
"vision_tower.transformer.layers.{bid}.attention_norm", # pixtral
"visual.blocks.{bid}.norm1", # qwen2vl
),
MODEL_TENSOR.V_ENC_OUTPUT: (
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
"vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL
"vpm.encoder.layers.{bid}.self_attn.out_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
"vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral
"visual.blocks.{bid}.attn.proj", # qwen2vl
),
MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
"vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
"vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL
"vpm.encoder.layers.{bid}.layer_norm2",
"model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
"vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral
"visual.blocks.{bid}.norm2", # qwen2vl
),
MODEL_TENSOR.V_ENC_FFN_UP: (
"vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
"vpm.encoder.layers.{bid}.mlp.fc1",
"model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
"vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral
"visual.blocks.{bid}.mlp.fc1", # qwen2vl
"visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
),
MODEL_TENSOR.V_ENC_FFN_GATE: (
"vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral
"visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
),
MODEL_TENSOR.V_ENC_FFN_DOWN: (
"vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
"vpm.encoder.layers.{bid}.mlp.fc2",
"model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
"vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral
"visual.blocks.{bid}.mlp.fc2", # qwen2vl
"visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
),
MODEL_TENSOR.V_LAYER_SCALE_1: (
"vision_tower.vision_model.encoder.layers.{bid}.ls1", # InternVL
),
MODEL_TENSOR.V_LAYER_SCALE_2: (
"vision_tower.vision_model.encoder.layers.{bid}.ls2", # InternVL
),
MODEL_TENSOR.V_PRE_NORM: (
"vision_tower.vision_model.pre_layrnorm",
"vision_tower.ln_pre", # pixtral
),
MODEL_TENSOR.V_POST_NORM: (
"vision_tower.vision_model.post_layernorm",
"model.vision_model.post_layernorm", # SmolVLM
"visual.merger.ln_q", # qwen2vl
),
MODEL_TENSOR.V_MM_INP_PROJ: ("multi_modal_projector.mm_input_projection",),
MODEL_TENSOR.V_MM_INP_NORM: ("multi_modal_projector.norm",),
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: ("multi_modal_projector.mm_soft_emb_norm",),
MODEL_TENSOR.V_RESMPL_POS_EMBD_K: ("resampler.pos_embed_k",),
MODEL_TENSOR.V_RESMPL_ATTN_Q: (
"resampler.attn.in_proj_q", # tensor generated from resampler.attn.in_proj
),
MODEL_TENSOR.V_RESMPL_ATTN_K: (
"resampler.attn.in_proj_k", # tensor generated from resampler.attn.in_proj
),
MODEL_TENSOR.V_RESMPL_ATTN_V: (
"resampler.attn.in_proj_v", # tensor generated from resampler.attn.in_proj
),
MODEL_TENSOR.V_RESMPL_ATTN_OUT: ("resampler.attn.out_proj",),
MODEL_TENSOR.V_RESMPL_KV: ("resampler.kv_proj",),
MODEL_TENSOR.V_RESMPL_POST_NORM: ("resampler.ln_post",),
MODEL_TENSOR.V_RESMPL_KV_NORM: ("resampler.ln_kv",),
MODEL_TENSOR.V_RESMPL_Q_NORM: ("resampler.ln_q",),
MODEL_TENSOR.V_RESMPL_PROJ: ("resampler.proj",),
MODEL_TENSOR.V_RESMPL_QUERY: ("resampler.query",),
MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: (
"v.token_embd.img_break", # for pixtral, this is a generated vector
),
MODEL_TENSOR.V_MM_PATCH_MERGER: (
"multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1
),
}
# architecture-specific block mappings
arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = {
MODEL_ARCH.ARCTIC: {
MODEL_TENSOR.FFN_NORM: ("model.layers.{bid}.residual_layernorm",),
MODEL_TENSOR.FFN_NORM_EXP: ("model.layers.{bid}.post_attention_layernorm",),
},
}
mapping: dict[str, tuple[MODEL_TENSOR, str]]
def __init__(self, arch: MODEL_ARCH, n_blocks: int):
self.mapping = {}
for tensor, keys in self.mappings_cfg.items():
if tensor not in MODEL_TENSORS[arch]:
continue
tensor_name = TENSOR_NAMES[tensor]
self.mapping[tensor_name] = (tensor, tensor_name)
for key in keys:
self.mapping[key] = (tensor, tensor_name)
if arch in self.arch_block_mappings_cfg:
self.block_mappings_cfg.update(self.arch_block_mappings_cfg[arch])
for bid in range(n_blocks):
for tensor, keys in self.block_mappings_cfg.items():
if tensor not in MODEL_TENSORS[arch]:
continue
tensor_name = TENSOR_NAMES[tensor].format(bid=bid)
self.mapping[tensor_name] = (tensor, tensor_name)
for key in keys:
key = key.format(bid=bid)
self.mapping[key] = (tensor, tensor_name)
def get_type_and_name(
self, key: str, try_suffixes: Sequence[str] = ()
) -> tuple[MODEL_TENSOR, str] | None:
result = self.mapping.get(key)
if result is not None:
return result
for suffix in try_suffixes:
if key.endswith(suffix):
result = self.mapping.get(key[: -len(suffix)])
if result is not None:
return result[0], result[1] + suffix
return None
def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None:
result = self.get_type_and_name(key, try_suffixes=try_suffixes)
if result is None:
return None
return result[1]
def get_type(
self, key: str, try_suffixes: Sequence[str] = ()
) -> MODEL_TENSOR | None:
result = self.get_type_and_name(key, try_suffixes=try_suffixes)
if result is None:
return None
return result[0]
def __getitem__(self, key: str) -> str:
try:
return self.mapping[key][1]
except KeyError:
raise KeyError(key)
def __contains__(self, key: str) -> bool:
return key in self.mapping
def __repr__(self) -> str:
return repr(self.mapping)
def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap:
return TensorNameMap(arch, n_blocks)
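# Illustrative usage sketch (not part of the original file; assumes MODEL_ARCH.LLAMA
# and the standard TENSOR_NAMES from .constants):
#   tmap = get_tensor_name_map(MODEL_ARCH.LLAMA, n_blocks=32)
#   tmap.get_name("model.layers.0.self_attn.q_proj.weight", try_suffixes=(".weight", ".bias"))
#   # -> "blk.0.attn_q.weight"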

316 src/gguf/utility.py (new file)

@@ -0,0 +1,316 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Literal
import os
import json
def fill_templated_filename(filename: str, output_type: str | None) -> str:
# Given a file name, fill in any type templates, e.g. 'some-model-name.{ftype}.gguf'
ftype_lowercase: str = output_type.lower() if output_type is not None else ""
ftype_uppercase: str = output_type.upper() if output_type is not None else ""
return filename.format(
ftype_lowercase,
outtype=ftype_lowercase,
ftype=ftype_lowercase,
OUTTYPE=ftype_uppercase,
FTYPE=ftype_uppercase,
)
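A quick sketch of the template filling above (file names are made up); both the lowercase and uppercase placeholders are honoured:
print(fill_templated_filename("some-model-name.{ftype}.gguf", "Q8_0"))  # some-model-name.q8_0.gguf
print(fill_templated_filename("some-model-name.{FTYPE}.gguf", "q8_0"))  # some-model-name.Q8_0.gguf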
def model_weight_count_rounded_notation(
model_params_count: int, min_digits: int = 2
) -> str:
if model_params_count > 1e12:
# Trillions Of Parameters
scaled_model_params = model_params_count * 1e-12
scale_suffix = "T"
elif model_params_count > 1e9:
# Billions Of Parameters
scaled_model_params = model_params_count * 1e-9
scale_suffix = "B"
elif model_params_count > 1e6:
# Millions Of Parameters
scaled_model_params = model_params_count * 1e-6
scale_suffix = "M"
else:
# Thousands Of Parameters
scaled_model_params = model_params_count * 1e-3
scale_suffix = "K"
fix = max(min_digits - len(str(round(scaled_model_params)).lstrip("0")), 0)
return f"{scaled_model_params:.{fix}f}{scale_suffix}"
def size_label(
total_params: int, shared_params: int, expert_params: int, expert_count: int
) -> str:
if expert_count > 0:
pretty_size = model_weight_count_rounded_notation(
abs(shared_params) + abs(expert_params), min_digits=2
)
size_class = f"{expert_count}x{pretty_size}"
else:
size_class = model_weight_count_rounded_notation(
abs(total_params), min_digits=2
)
return size_class
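A small worked example for the two helpers above, using illustrative parameter counts:
# Dense model: ~7.24B total parameters
print(model_weight_count_rounded_notation(7_240_000_000))  # 7.2B
# MoE model: 8 experts; only shared + per-expert counts matter here, the total is ignored
print(size_label(46_700_000_000, 1_600_000_000, 5_600_000_000, 8))  # 8x7.2B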
def naming_convention(
model_name: str | None,
base_name: str | None,
finetune_string: str | None,
version_string: str | None,
size_label: str | None,
output_type: str | None,
model_type: Literal["vocab", "LoRA"] | None = None,
) -> str:
# Reference: https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#gguf-naming-convention
if base_name is not None:
name = base_name.strip().replace(" ", "-").replace("/", "-")
elif model_name is not None:
name = model_name.strip().replace(" ", "-").replace("/", "-")
else:
name = "ggml-model"
parameters = f"-{size_label}" if size_label is not None else ""
finetune = (
f"-{finetune_string.strip().replace(' ', '-')}"
if finetune_string is not None
else ""
)
version = (
f"-{version_string.strip().replace(' ', '-')}"
if version_string is not None
else ""
)
encoding = (
f"-{output_type.strip().replace(' ', '-').upper()}"
if output_type is not None
else ""
)
kind = f"-{model_type.strip().replace(' ', '-')}" if model_type is not None else ""
return f"{name}{parameters}{finetune}{version}{encoding}{kind}"
@dataclass
class RemoteTensor:
dtype: str
shape: tuple[int, ...]
offset_start: int
size: int
url: str
def data(self) -> bytearray:
# TODO: handle request errors (maybe with limited retries?)
# NOTE: using a bytearray, otherwise PyTorch complains the buffer is not writeable
data = bytearray(
SafetensorRemote.get_data_by_range(
url=self.url, start=self.offset_start, size=self.size
)
)
return data
class SafetensorRemote:
"""
Utility class to handle remote safetensor files.
This class is designed to work with Hugging Face model repositories.
Example (one model has single safetensor file, the other has multiple):
for model_id in ["ngxson/TEST-Tiny-Llama4", "Qwen/Qwen2.5-7B-Instruct"]:
tensors = SafetensorRemote.get_list_tensors_hf_model(model_id)
print(tensors)
Example reading tensor data:
tensors = SafetensorRemote.get_list_tensors_hf_model(model_id)
for name, meta in tensors.items():
dtype, shape, offset_start, size, remote_safetensor_url = meta
# read the tensor data
data = SafetensorRemote.get_data_by_range(remote_safetensor_url, offset_start, size)
print(data)
"""
BASE_DOMAIN = "https://huggingface.co"
ALIGNMENT = 8 # bytes
@classmethod
def get_list_tensors_hf_model(cls, model_id: str) -> dict[str, RemoteTensor]:
"""
Get list of tensors from a Hugging Face model repository.
Returns a dictionary of tensor names and their metadata.
Each tensor is represented as a tuple of (dtype, shape, offset_start, size, remote_safetensor_url)
"""
# case 1: model has only one single model.safetensor file
is_single_file = cls.check_file_exist(
f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors"
)
if is_single_file:
url = f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors"
return cls.get_list_tensors(url)
# case 2: model has multiple files
index_url = (
f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors.index.json"
)
is_multiple_files = cls.check_file_exist(index_url)
if is_multiple_files:
# read the index file
index_data = cls.get_data_by_range(index_url, 0)
index_str = index_data.decode("utf-8")
index_json = json.loads(index_str)
assert (
index_json.get("weight_map") is not None
), "weight_map not found in index file"
weight_map = index_json["weight_map"]
# get the list of files
all_files = list(set(weight_map.values()))
all_files.sort() # make sure we load shard files in order
# get the list of tensors
tensors: dict[str, RemoteTensor] = {}
for file in all_files:
url = f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/{file}"
for key, val in cls.get_list_tensors(url).items():
tensors[key] = val
return tensors
raise ValueError(f"Model {model_id} does not have any safetensor files")
@classmethod
def get_list_tensors(cls, url: str) -> dict[str, RemoteTensor]:
"""
Get list of tensors from a remote safetensor file.
Returns a dictionary of tensor names and their metadata.
Each tensor is represented as a tuple of (dtype, shape, offset_start, size)
"""
metadata, data_start_offset = cls.get_metadata(url)
res: dict[str, RemoteTensor] = {}
for name, meta in metadata.items():
if name == "__metadata__":
continue
if not isinstance(meta, dict):
raise ValueError(f"Invalid metadata for tensor '{name}': {meta}")
try:
dtype = meta["dtype"]
shape = meta["shape"]
offset_start_relative, offset_end_relative = meta["data_offsets"]
size = offset_end_relative - offset_start_relative
offset_start = data_start_offset + offset_start_relative
res[name] = RemoteTensor(
dtype=dtype,
shape=tuple(shape),
offset_start=offset_start,
size=size,
url=url,
)
except KeyError as e:
raise ValueError(
f"Missing key in metadata for tensor '{name}': {e}, meta = {meta}"
)
return res
@classmethod
def get_metadata(cls, url: str) -> tuple[dict, int]:
"""
Get JSON metadata from a remote safetensor file.
Returns tuple of (metadata, data_start_offset)
"""
# Request first 5MB of the file (hopefully enough for metadata)
read_size = 5 * 1024 * 1024
raw_data = cls.get_data_by_range(url, 0, read_size)
# Parse header
# First 8 bytes contain the metadata length as u64 little-endian
if len(raw_data) < 8:
raise ValueError("Not enough data to read metadata size")
metadata_length = int.from_bytes(raw_data[:8], byteorder="little")
# Calculate the data start offset
data_start_offset = 8 + metadata_length
alignment = SafetensorRemote.ALIGNMENT
if data_start_offset % alignment != 0:
data_start_offset += alignment - (data_start_offset % alignment)
# Check if we have enough data to read the metadata
if len(raw_data) < 8 + metadata_length:
raise ValueError(
f"Could not read complete metadata. Need {8 + metadata_length} bytes, got {len(raw_data)}"
)
# Extract metadata bytes and parse as JSON
metadata_bytes = raw_data[8 : 8 + metadata_length]
metadata_str = metadata_bytes.decode("utf-8")
try:
metadata = json.loads(metadata_str)
return metadata, data_start_offset
except json.JSONDecodeError as e:
raise ValueError(f"Failed to parse safetensor metadata as JSON: {e}")
@classmethod
def get_data_by_range(cls, url: str, start: int, size: int = -1) -> bytes:
"""
Get raw byte data from a remote file by range.
If size is not specified, it will read the entire file.
"""
import requests
from urllib.parse import urlparse
parsed_url = urlparse(url)
if not parsed_url.scheme or not parsed_url.netloc:
raise ValueError(f"Invalid URL: {url}")
headers = cls._get_request_headers()
if size > -1:
headers["Range"] = f"bytes={start}-{start + size}"
response = requests.get(url, allow_redirects=True, headers=headers)
response.raise_for_status()
# Get raw byte data
return response.content[:size]
@classmethod
def check_file_exist(cls, url: str) -> bool:
"""
Check if a file exists at the given URL.
Returns True if the file exists, False otherwise.
"""
import requests
from urllib.parse import urlparse
parsed_url = urlparse(url)
if not parsed_url.scheme or not parsed_url.netloc:
raise ValueError(f"Invalid URL: {url}")
try:
headers = cls._get_request_headers()
headers["Range"] = "bytes=0-0"
response = requests.head(url, allow_redirects=True, headers=headers)
# Success (2xx) or redirect (3xx)
return 200 <= response.status_code < 400
except requests.RequestException:
return False
@classmethod
def _get_request_headers(cls) -> dict[str, str]:
"""Prepare common headers for requests."""
headers = {"User-Agent": "convert_hf_to_gguf"}
if os.environ.get("HF_TOKEN"):
headers["Authorization"] = f"Bearer {os.environ['HF_TOKEN']}"
return headers
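For reference, get_metadata() relies on the standard safetensors layout: an 8-byte little-endian header length followed by a JSON blob (the class additionally rounds the data offset up to ALIGNMENT bytes). A minimal local sketch of the same parsing, with a hypothetical file path:
import json
import struct
def read_safetensors_header(path: str) -> tuple[dict, int]:
    # u64 little-endian header length, then the JSON metadata itself
    with open(path, "rb") as f:
        (metadata_length,) = struct.unpack("<Q", f.read(8))
        metadata = json.loads(f.read(metadata_length).decode("utf-8"))
    return metadata, 8 + metadata_length
# meta, data_start = read_safetensors_header("model.safetensors")  # hypothetical local file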


@ -5,7 +5,16 @@
import json
import os
from pathlib import Path
from typing import Any, Callable, Sequence, Mapping, Iterable, Protocol, ClassVar, runtime_checkable
from typing import (
Any,
Callable,
Sequence,
Mapping,
Iterable,
Protocol,
ClassVar,
runtime_checkable,
)
from sentencepiece import SentencePieceProcessor
@ -23,7 +32,9 @@ class SpecialVocab:
chat_template: str | Sequence[Mapping[str, str]] | None
def __init__(
self, path: str | os.PathLike[str], load_merges: bool = False,
self,
path: str | os.PathLike[str],
load_merges: bool = False,
special_token_types: Iterable[str] | None = None,
n_vocab: int | None = None,
):
@ -36,40 +47,60 @@ def __init__(
if special_token_types is not None:
self.special_token_types = special_token_types
else:
self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad', 'cls', 'mask')
self.special_token_types = (
"bos",
"eos",
"unk",
"sep",
"pad",
"cls",
"mask",
)
self._load(Path(path))
def __repr__(self) -> str:
return '<SpecialVocab with {} merges, special tokens {}, add special tokens {}>'.format(
len(self.merges), self.special_token_ids or "unset", self.add_special_token or "unset",
return "<SpecialVocab with {} merges, special tokens {}, add special tokens {}>".format(
len(self.merges),
self.special_token_ids or "unset",
self.add_special_token or "unset",
)
def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
if self.merges:
if not quiet:
logger.info(f'Adding {len(self.merges)} merge(s).')
logger.info(f"Adding {len(self.merges)} merge(s).")
gw.add_token_merges(self.merges)
elif self.load_merges:
logger.warning('Adding merges requested but no merges found, output may be non-functional.')
logger.warning(
"Adding merges requested but no merges found, output may be non-functional."
)
for typ, tokid in self.special_token_ids.items():
id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
id_handler: Callable[[int], None] | None = getattr(
gw, f"add_{typ}_token_id", None
)
if id_handler is None:
logger.warning(f'No handler for special token type {typ} with id {tokid} - skipping')
logger.warning(
f"No handler for special token type {typ} with id {tokid} - skipping"
)
continue
if not quiet:
logger.info(f'Setting special token type {typ} to {tokid}')
logger.info(f"Setting special token type {typ} to {tokid}")
id_handler(tokid)
for typ, value in self.add_special_token.items():
add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None)
add_handler: Callable[[bool], None] | None = getattr(
gw, f"add_add_{typ}_token", None
)
if add_handler is None:
logger.warning(f'No handler for add_{typ}_token with value {value} - skipping')
logger.warning(
f"No handler for add_{typ}_token with value {value} - skipping"
)
continue
if not quiet:
logger.info(f'Setting add_{typ}_token to {value}')
logger.info(f"Setting add_{typ}_token to {value}")
add_handler(value)
if self.chat_template is not None:
if not quiet:
logger.info(f'Setting chat_template to {self.chat_template}')
logger.info(f"Setting chat_template to {self.chat_template}")
gw.add_chat_template(self.chat_template)
def _load(self, path: Path) -> None:
@ -79,12 +110,12 @@ def _load(self, path: Path) -> None:
self._try_load_merges_txt(path)
def _try_load_merges_txt(self, path: Path) -> bool:
merges_file = path / 'merges.txt'
merges_file = path / "merges.txt"
if not merges_file.is_file():
return False
with open(merges_file, 'r', encoding = 'utf-8') as fp:
first_line = next(fp, '').strip()
if not first_line.startswith('#'):
with open(merges_file, "r", encoding="utf-8") as fp:
first_line = next(fp, "").strip()
if not first_line.startswith("#"):
fp.seek(0)
line_num = 0
else:
@ -97,9 +128,11 @@ def _try_load_merges_txt(self, path: Path) -> bool:
continue
parts = line.split(None, 3)
if len(parts) != 2:
logger.warning(f'{merges_file.name}: Line {line_num}: Entry malformed, ignoring')
logger.warning(
f"{merges_file.name}: Line {line_num}: Entry malformed, ignoring"
)
continue
merges.append(f'{parts[0]} {parts[1]}')
merges.append(f"{parts[0]} {parts[1]}")
self.merges = merges
return True
@ -107,45 +140,82 @@ def _set_special_token(self, typ: str, tid: Any) -> None:
if not isinstance(tid, int):
return
if tid < 0:
raise ValueError(f'invalid value for special token type {typ}: {tid}')
raise ValueError(f"invalid value for special token type {typ}: {tid}")
if self.n_vocab is None or tid < self.n_vocab:
if typ in self.special_token_ids:
return
self.special_token_ids[typ] = tid
return
logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')
logger.warning(
f"Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping"
)
def _try_load_from_tokenizer_json(self, path: Path) -> bool:
tokenizer_file = path / 'tokenizer.json'
tokenizer_file = path / "tokenizer.json"
if tokenizer_file.is_file():
with open(tokenizer_file, encoding = 'utf-8') as f:
with open(tokenizer_file, encoding="utf-8") as f:
tokenizer = json.load(f)
if self.load_merges:
merges = tokenizer.get('model', {}).get('merges')
if isinstance(merges, list) and merges and isinstance(merges[0], str):
self.merges = merges
added_tokens = tokenizer.get('added_tokens', {})
merges = tokenizer.get("model", {}).get("merges")
if isinstance(merges, list) and merges:
if isinstance(merges[0], str):
self.merges = merges
elif (
isinstance(merges[0], list)
and len(merges[0]) == 2
and isinstance(merges[0][0], str)
):
# New format since transformers 4.45 to support spaces in merges
# ref: https://github.com/ggml-org/llama.cpp/issues/9692
# TODO: internally store as the new format instead of converting to old
if any(" " in s for pair in merges for s in pair):
logger.warning(
f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}'
)
self.merges = [
" ".join(
[
# ensure the spaces are properly encoded
"".join(
chr(ord(c) + 256) if c == " " else c
for c in part
)
for part in pair
]
)
for pair in merges
]
else:
raise ValueError("Unknown tokenizer merges format")
added_tokens = tokenizer.get("added_tokens", {})
else:
added_tokens = {}
tokenizer_config_file = path / 'tokenizer_config.json'
tokenizer_config_file = path / "tokenizer_config.json"
if not tokenizer_config_file.is_file():
return True
with open(tokenizer_config_file, encoding = 'utf-8') as f:
with open(tokenizer_config_file, encoding="utf-8") as f:
tokenizer_config = json.load(f)
chat_template = tokenizer_config.get('chat_template')
chat_template_alt = None
chat_template_file = path / "chat_template.json"
if chat_template_file.is_file():
with open(chat_template_file, encoding="utf-8") as f:
chat_template_alt = json.load(f).get("chat_template")
chat_template = tokenizer_config.get("chat_template", chat_template_alt)
if chat_template is None or isinstance(chat_template, (str, list)):
self.chat_template = chat_template
else:
logger.warning(f'Bad type for chat_template field in {tokenizer_config_file!r} - ignoring')
logger.warning(
f"Bad type for chat_template field in {tokenizer_config_file!r} - ignoring"
)
for typ in self.special_token_types:
add_entry = tokenizer_config.get(f'add_{typ}_token')
add_entry = tokenizer_config.get(f"add_{typ}_token")
if isinstance(add_entry, bool):
self.add_special_token[typ] = add_entry
entry = tokenizer_config.get(f'{typ}_token')
entry = tokenizer_config.get(f"{typ}_token")
if isinstance(entry, str):
tc_content = entry
elif isinstance(entry, dict):
entry_content = entry.get('content')
entry_content = entry.get("content")
if not isinstance(entry_content, str):
continue
tc_content = entry_content
@ -153,20 +223,24 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
continue
# We only need the first match here.
maybe_token_id = next(
(atok.get('id') for atok in added_tokens if atok.get('content') == tc_content),
(
atok.get("id")
for atok in added_tokens
if atok.get("content") == tc_content
),
None,
)
self._set_special_token(typ, maybe_token_id)
return True
def _try_load_from_config_json(self, path: Path) -> bool:
config_file = path / 'config.json'
config_file = path / "config.json"
if not config_file.is_file():
return False
with open(config_file, encoding = 'utf-8') as f:
with open(config_file, encoding="utf-8") as f:
config = json.load(f)
for typ in self.special_token_types:
self._set_special_token(typ, config.get(f'{typ}_token_id'))
self._set_special_token(typ, config.get(f"{typ}_token_id"))
return True
@ -202,54 +276,59 @@ class BpeVocab(Vocab):
def __init__(self, base_path: Path):
added_tokens: dict[str, int] = {}
if (fname_tokenizer := base_path / 'vocab.json').exists():
if (fname_tokenizer := base_path / "vocab.json").exists():
# "slow" tokenizer
with open(fname_tokenizer, encoding="utf-8") as f:
self.vocab = json.load(f)
try:
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
with open(base_path / 'added_tokens.json', encoding="utf-8") as f:
with open(base_path / "added_tokens.json", encoding="utf-8") as f:
added_tokens = json.load(f)
except FileNotFoundError:
pass
else:
# "fast" tokenizer
fname_tokenizer = base_path / 'tokenizer.json'
fname_tokenizer = base_path / "tokenizer.json"
# if this fails, FileNotFoundError propagates to caller
with open(fname_tokenizer, encoding="utf-8") as f:
tokenizer_json = json.load(f)
tokenizer_model: dict[str, Any] = tokenizer_json['model']
tokenizer_model: dict[str, Any] = tokenizer_json["model"]
if (
tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'ByteLevel'
tokenizer_model["type"] != "BPE"
or tokenizer_model.get("byte_fallback", False)
or tokenizer_json["decoder"]["type"] != "ByteLevel"
):
raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
raise FileNotFoundError("Cannot find GPT-2 BPE tokenizer")
self.vocab = tokenizer_model["vocab"]
if (added := tokenizer_json.get('added_tokens')) is not None:
if (added := tokenizer_json.get("added_tokens")) is not None:
# Added tokens here can be duplicates of the main vocabulary.
added_tokens = {item['content']: item['id']
for item in added
if item['content'] not in self.vocab}
added_tokens = {
item["content"]: item["id"]
for item in added
if item["content"] not in self.vocab
}
vocab_size = len(self.vocab)
vocab_size = len(self.vocab)
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids:
expected_end_id = vocab_size + len(actual_ids) - 1
raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
f"{vocab_size} - {expected_end_id}; got {actual_ids}")
raise ValueError(
f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
f"{vocab_size} - {expected_end_id}; got {actual_ids}"
)
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
self.added_tokens_dict = added_tokens
self.added_tokens_list = [text for (text, idx) in items]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
self.added_tokens_dict = added_tokens
self.added_tokens_list = [text for (text, idx) in items]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
@ -276,40 +355,44 @@ class SentencePieceVocab(Vocab):
def __init__(self, base_path: Path):
added_tokens: dict[str, int] = {}
if (fname_tokenizer := base_path / 'tokenizer.model').exists():
if (fname_tokenizer := base_path / "tokenizer.model").exists():
# normal location
try:
with open(base_path / 'added_tokens.json', encoding="utf-8") as f:
with open(base_path / "added_tokens.json", encoding="utf-8") as f:
added_tokens = json.load(f)
except FileNotFoundError:
pass
elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
elif not (fname_tokenizer := base_path.parent / "tokenizer.model").exists():
# not found in alternate location either
raise FileNotFoundError('Cannot find tokenizer.model')
raise FileNotFoundError("Cannot find tokenizer.model")
self.sentencepiece_tokenizer = SentencePieceProcessor()
self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
vocab_size = self.sentencepiece_tokenizer.vocab_size()
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
new_tokens = {
id: piece for piece, id in added_tokens.items() if id >= vocab_size
}
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
actual_new_ids = sorted(new_tokens.keys())
actual_new_ids = sorted(new_tokens.keys())
if expected_new_ids != actual_new_ids:
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
raise ValueError(
f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}"
)
# Token pieces that were added to the base vocabulary.
self.added_tokens_dict = added_tokens
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
self.added_tokens_dict = added_tokens
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
tokenizer = self.sentencepiece_tokenizer
for i in range(tokenizer.vocab_size()):
piece = tokenizer.IdToPiece(i)
text = piece.encode("utf-8")
text = piece.encode("utf-8")
score: float = tokenizer.GetScore(i)
toktype = gguf.TokenType.NORMAL
@ -347,25 +430,27 @@ class LlamaHfVocab(Vocab):
name = "hfft"
def __init__(self, base_path: Path):
fname_tokenizer = base_path / 'tokenizer.json'
fname_tokenizer = base_path / "tokenizer.json"
# if this fails, FileNotFoundError propagates to caller
with open(fname_tokenizer, encoding='utf-8') as f:
with open(fname_tokenizer, encoding="utf-8") as f:
tokenizer_json = json.load(f)
# pre-check so we know if we need transformers
tokenizer_model: dict[str, Any] = tokenizer_json['model']
tokenizer_model: dict[str, Any] = tokenizer_json["model"]
is_llama3 = (
tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
and not tokenizer_model.get('byte_fallback', True)
tokenizer_model["type"] == "BPE"
and tokenizer_model.get("ignore_merges", False)
and not tokenizer_model.get("byte_fallback", True)
)
if is_llama3:
raise TypeError('Llama 3 must be converted with BpeVocab')
raise TypeError("Llama 3 must be converted with BpeVocab")
if not is_llama3 and (
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'Sequence'
tokenizer_model["type"] != "BPE"
or not tokenizer_model.get("byte_fallback", False)
or tokenizer_json["decoder"]["type"] != "Sequence"
):
raise FileNotFoundError('Cannot find Llama BPE tokenizer')
raise FileNotFoundError("Cannot find Llama BPE tokenizer")
try:
from transformers import AutoTokenizer
@ -387,7 +472,7 @@ def __init__(self, base_path: Path):
# Initialize lists and dictionaries for added tokens
self.added_tokens_list = []
self.added_tokens_dict = dict()
self.added_tokens_ids = set()
self.added_tokens_ids = set()
# Process added tokens
for tok, tokidx in sorted(
@ -408,7 +493,7 @@ def __init__(self, base_path: Path):
# Set vocabulary sizes
self.vocab_size_base = self.tokenizer.vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
@ -427,16 +512,22 @@ def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
# Yield token text, score, and type
yield token_text, self.get_token_score(token_id), self.get_token_type(
token_id, token_text, self.special_ids # Reuse already stored special IDs
token_id,
token_text,
self.special_ids, # Reuse already stored special IDs
)
def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
def get_token_type(
self, token_id: int, token_text: bytes, special_ids: set[int]
) -> gguf.TokenType:
# Special case for byte tokens
if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
if re.fullmatch(rb"<0x[0-9A-Fa-f]{2}>", token_text):
return gguf.TokenType.BYTE
# Determine token type based on whether it's a special token
return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
return (
gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
)
def get_token_score(self, token_id: int) -> float:
# Placeholder for actual logic to determine the token's score
@ -446,7 +537,9 @@ def get_token_score(self, token_id: int) -> float:
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
if text in self.specials:
toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
toktype = self.get_token_type(
self.specials[text], b"", self.special_ids
)
score = self.get_token_score(self.specials[text])
else:
toktype = gguf.TokenType.USER_DEFINED

src/globals.py (new file, 123 lines)

@ -0,0 +1,123 @@
import os
import re
import sys
from typing import Any, IO, List, TextIO, Union
from PySide6.QtWidgets import (
QMessageBox,
)
from Localizations import (
DOTENV_FILE_NOT_FOUND,
COULD_NOT_PARSE_LINE,
ERROR_LOADING_DOTENV,
AUTOGGUF_VERSION,
)
def verify_gguf(file_path) -> bool:
try:
with open(file_path, "rb") as f:
magic = f.read(4)
return magic == b"GGUF"
except (FileNotFoundError, IOError, OSError):
return False
def process_args(args: List[str]) -> bool:
try:
i = 1
while i < len(args):
key = (
args[i][2:].replace("-", "_").upper()
) # Strip the first two '--' and replace '-' with '_'
if i + 1 < len(args) and not args[i + 1].startswith("--"):
value = args[i + 1]
i += 2
else:
value = "enabled"
i += 1
os.environ[key] = value
return True
except Exception:
return False
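A short sketch of what process_args() does with a typical argument vector (flag names are illustrative):
import os
# "--model-dir ./models --verbose" becomes MODEL_DIR=./models and VERBOSE=enabled
process_args(["autogguf", "--model-dir", "./models", "--verbose"])
print(os.environ["MODEL_DIR"])  # ./models
print(os.environ["VERBOSE"])    # enabled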
def load_dotenv(self: Any) -> None:
if not os.path.isfile(".env"):
self.logger.warning(DOTENV_FILE_NOT_FOUND)
return
try:
with open(".env") as f:
for line in f:
# Strip leading/trailing whitespace
line = line.strip()
# Ignore comments and empty lines
if not line or line.startswith("#"):
continue
# Match key-value pairs (unquoted and quoted values)
match = re.match(r"^([^=]+)=(.*)$", line)
if not match:
self.logger.warning(COULD_NOT_PARSE_LINE.format(line))
continue
key, value = match.groups()
# Remove any surrounding quotes from the value
if value.startswith(("'", '"')) and value.endswith(("'", '"')):
value = value[1:-1]
# Decode escape sequences
value = bytes(value, "utf-8").decode("unicode_escape")
# Set the environment variable
os.environ[key.strip()] = value.strip()
except Exception as e:
self.logger.error(ERROR_LOADING_DOTENV.format(e))
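A minimal sketch of the .env handling above, assuming the module's own load_dotenv() and hypothetical file contents; quotes are stripped and escape sequences decoded before the values land in os.environ:
import os
with open(".env", "w") as f:
    f.write("# comments and blank lines are skipped\n")
    f.write('AUTOGGUF_SERVER="enabled"\n')   # surrounding quotes are stripped
    f.write("AUTOGGUF_SERVER_PORT=7001\n")
class _Host:
    logger = None  # only consulted on warning/error paths
load_dotenv(_Host())
print(os.environ["AUTOGGUF_SERVER"], os.environ["AUTOGGUF_SERVER_PORT"])  # enabled 7001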
def show_about(self) -> None:
about_text = f"""AutoGGUF
Version: {AUTOGGUF_VERSION}
A tool for managing and converting GGUF models.
This application is licensed under the Apache License 2.0.
Copyright (c) 2024-2025 leafspark.
It also utilizes llama.cpp, licensed under the MIT License.
Copyright (c) 2023-2025 The ggml authors."""
QMessageBox.about(self, "About AutoGGUF", about_text)
def ensure_directory(path) -> None:
if not os.path.exists(path):
os.makedirs(path)
def open_file_safe(file_path, mode="r") -> IO[Any]:
encodings = ["utf-8", "latin-1", "ascii", "utf-16"]
for encoding in encodings:
try:
return open(file_path, mode, encoding=encoding)
except UnicodeDecodeError:
continue
raise ValueError(
f"Unable to open file {file_path} with any of the encodings: {encodings}"
)
def resource_path(relative_path) -> Union[str, bytes]:
if hasattr(sys, "_MEIPASS"):
# PyInstaller path
base_path = sys._MEIPASS
elif "__compiled__" in globals():
# Nuitka path
base_path = os.path.dirname(sys.executable)
else:
# Regular Python path
base_path = os.path.abspath(".")
return os.path.join(base_path, relative_path)


@ -1,39 +0,0 @@
import os
import sys
import psutil
import subprocess
import time
import signal
import json
import platform
import requests
import zipfile
from datetime import datetime
from PyQt6.QtWidgets import (QApplication, QMainWindow, QVBoxLayout, QHBoxLayout, QWidget, QPushButton,
QListWidget, QLineEdit, QLabel, QFileDialog, QProgressBar, QComboBox, QTextEdit,
QCheckBox, QGroupBox, QFormLayout, QScrollArea, QSlider, QSpinBox, QListWidgetItem,
QMessageBox, QDialog, QPlainTextEdit, QMenu)
from PyQt6.QtCore import QTimer, QThread, pyqtSignal, Qt, QSize
from PyQt6.QtGui import QCloseEvent, QAction
def ensure_directory(path):
if not os.path.exists(path):
os.makedirs(path)
def open_file_safe(file_path, mode='r'):
encodings = ['utf-8', 'latin-1', 'ascii', 'utf-16']
for encoding in encodings:
try:
return open(file_path, mode, encoding=encoding)
except UnicodeDecodeError:
continue
raise ValueError(f"Unable to open file {file_path} with any of the encodings: {encodings}")
def resource_path(relative_path):
try:
# PyInstaller creates a temp folder and stores path in _MEIPASS
base_path = sys._MEIPASS
except Exception:
base_path = os.path.abspath(".")
return os.path.join(base_path, relative_path)

src/lora_conversion.py (new file, 226 lines)

@ -0,0 +1,226 @@
from datetime import datetime
from PySide6.QtWidgets import (
QFileDialog,
QHBoxLayout,
QLineEdit,
QListWidgetItem,
QPushButton,
QWidget,
)
from QuantizationThread import QuantizationThread
from TaskListItem import TaskListItem
from error_handling import handle_error, show_error
from globals import ensure_directory
from Localizations import *
def export_lora(self) -> None:
self.logger.info(STARTING_LORA_EXPORT)
try:
model_path = self.export_lora_model.text()
output_path = self.export_lora_output.text()
lora_adapters = []
for i in range(self.export_lora_adapters.count()):
item = self.export_lora_adapters.item(i)
adapter_widget = self.export_lora_adapters.itemWidget(item)
path_input = adapter_widget.layout().itemAt(0).widget()
scale_input = adapter_widget.layout().itemAt(1).widget()
adapter_path = path_input.text()
adapter_scale = scale_input.text()
lora_adapters.append((adapter_path, adapter_scale))
if not model_path:
raise ValueError(MODEL_PATH_REQUIRED)
if not output_path:
raise ValueError(OUTPUT_PATH_REQUIRED)
if not lora_adapters:
raise ValueError(AT_LEAST_ONE_LORA_ADAPTER_REQUIRED)
backend_path = self.backend_combo.currentData()
if not backend_path:
raise ValueError(NO_BACKEND_SELECTED)
command = [
os.path.join(backend_path, "llama-export-lora"),
"--model",
model_path,
"--output",
output_path,
]
for adapter_path, adapter_scale in lora_adapters:
if adapter_path:
if adapter_scale:
try:
scale_value = float(adapter_scale)
command.extend(
["--lora-scaled", adapter_path, str(scale_value)]
)
except ValueError:
raise ValueError(INVALID_LORA_SCALE_VALUE)
else:
command.extend(["--lora", adapter_path])
threads = self.export_lora_threads.value()
command.extend(["--threads", str(threads)])
logs_path = self.logs_input.text()
ensure_directory(logs_path)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = os.path.join(logs_path, f"lora_export_{timestamp}.log")
command_str = " ".join(command)
self.logger.info(f"{LORA_EXPORT_COMMAND}: {command_str}")
thread = QuantizationThread(command, backend_path, log_file)
self.quant_threads.append(thread)
task_item = TaskListItem(EXPORTING_LORA, log_file, show_progress_bar=False)
list_item = QListWidgetItem(self.task_list)
list_item.setSizeHint(task_item.sizeHint())
self.task_list.addItem(list_item)
self.task_list.setItemWidget(list_item, task_item)
thread.status_signal.connect(task_item.update_status)
thread.finished_signal.connect(lambda: self.task_finished(thread))
thread.error_signal.connect(
lambda err: handle_error(self.logger, err, task_item)
)
thread.start()
self.logger.info(LORA_EXPORT_TASK_STARTED)
except ValueError as e:
show_error(self.logger, str(e))
except Exception as e:
show_error(self.logger, ERROR_STARTING_LORA_EXPORT.format(str(e)))
def lora_conversion_finished(self, thread) -> None:
self.logger.info(LORA_CONVERSION_FINISHED)
if thread in self.quant_threads:
self.quant_threads.remove(thread)
def delete_lora_adapter_item(self, adapter_widget) -> None:
self.logger.info(DELETING_LORA_ADAPTER)
# Find the QListWidgetItem containing the adapter_widget
for i in range(self.export_lora_adapters.count()):
item = self.export_lora_adapters.item(i)
if self.export_lora_adapters.itemWidget(item) == adapter_widget:
self.export_lora_adapters.takeItem(i) # Remove the item
break
def browse_export_lora_model(self) -> None:
self.logger.info(BROWSING_FOR_EXPORT_LORA_MODEL_FILE)
model_file, _ = QFileDialog.getOpenFileName(self, SELECT_MODEL_FILE, "", GGUF_FILES)
if model_file:
self.export_lora_model.setText(os.path.abspath(model_file))
def browse_export_lora_output(self) -> None:
self.logger.info(BROWSING_FOR_EXPORT_LORA_OUTPUT_FILE)
output_file, _ = QFileDialog.getSaveFileName(
self, SELECT_OUTPUT_FILE, "", GGUF_FILES
)
if output_file:
self.export_lora_output.setText(os.path.abspath(output_file))
def add_lora_adapter(self) -> None:
self.logger.info(ADDING_LORA_ADAPTER)
adapter_path, _ = QFileDialog.getOpenFileName(
self, SELECT_LORA_ADAPTER_FILE, "", LORA_FILES
)
if adapter_path:
# Create a widget to hold the path and scale input
adapter_widget = QWidget()
adapter_layout = QHBoxLayout(adapter_widget)
path_input = QLineEdit(adapter_path)
path_input.setReadOnly(True)
adapter_layout.addWidget(path_input)
scale_input = QLineEdit("1.0") # Default scale value
adapter_layout.addWidget(scale_input)
delete_button = QPushButton(DELETE_ADAPTER)
delete_button.clicked.connect(
lambda: self.delete_lora_adapter_item(adapter_widget)
)
adapter_layout.addWidget(delete_button)
# Add the widget to the list
list_item = QListWidgetItem(self.export_lora_adapters)
list_item.setSizeHint(adapter_widget.sizeHint())
self.export_lora_adapters.addItem(list_item)
self.export_lora_adapters.setItemWidget(list_item, adapter_widget)
def convert_lora(self) -> None:
self.logger.info(STARTING_LORA_CONVERSION)
try:
lora_input_path = self.lora_input.text()
lora_output_path = self.lora_output.text()
lora_output_type = self.lora_output_type_combo.currentText()
if not lora_input_path:
raise ValueError(LORA_INPUT_PATH_REQUIRED)
if not lora_output_path:
raise ValueError(LORA_OUTPUT_PATH_REQUIRED)
if lora_output_type == "GGUF": # Use new file and parameters for GGUF
command = [
"python",
"src/convert_lora_to_gguf.py",
"--outfile",
lora_output_path,
lora_input_path,
]
base_model_path = self.base_model_path.text()
if not base_model_path:
raise ValueError(BASE_MODEL_PATH_REQUIRED)
command.extend(["--base", base_model_path])
else: # Use old GGML parameters for GGML
command = [
"python",
"src/convert_lora_to_ggml.py",
lora_input_path,
lora_output_path,
]
logs_path = self.logs_input.text()
ensure_directory(logs_path)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = os.path.join(logs_path, f"lora_conversion_{timestamp}.log")
command_str = " ".join(command)
self.logger.info(f"{LORA_CONVERSION_COMMAND}: {command_str}")
thread = QuantizationThread(command, os.getcwd(), log_file)
self.quant_threads.append(thread)
task_name = LORA_CONVERSION_FROM_TO.format(
os.path.basename(lora_input_path), os.path.basename(lora_output_path)
)
task_item = TaskListItem(task_name, log_file, show_progress_bar=False)
list_item = QListWidgetItem(self.task_list)
list_item.setSizeHint(task_item.sizeHint())
self.task_list.addItem(list_item)
self.task_list.setItemWidget(list_item, task_item)
thread.status_signal.connect(task_item.update_status)
thread.finished_signal.connect(lambda: self.lora_conversion_finished(thread))
thread.error_signal.connect(
lambda err: handle_error(self.logger, err, task_item)
)
thread.start()
self.logger.info(LORA_CONVERSION_TASK_STARTED)
except ValueError as e:
show_error(self.logger, str(e))
except Exception as e:
show_error(self.logger, ERROR_STARTING_LORA_CONVERSION.format(str(e)))
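For reference, with hypothetical paths the GGUF branch above assembles a command roughly like the following before handing it to QuantizationThread:
# Illustrative only; the real paths come from the UI fields
command = [
    "python", "src/convert_lora_to_gguf.py",
    "--outfile", "adapters/my-lora.gguf",  # lora_output_path
    "adapters/my-lora",                    # lora_input_path
    "--base", "models/base-model",         # required when the output type is GGUF
]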


@ -1,9 +1,196 @@
import sys
from PyQt6.QtWidgets import QApplication
from AutoGGUF import AutoGGUF
if __name__ == "__main__":
app = QApplication(sys.argv)
window = AutoGGUF()
window.show()
sys.exit(app.exec())
import os
import sys
import threading
from enum import Enum
from typing import List, Optional
from PySide6.QtCore import QTimer
from PySide6.QtWidgets import QApplication
from fastapi import FastAPI, Query, Depends, HTTPException, Security
from fastapi.security.api_key import APIKeyHeader
from pydantic import BaseModel, Field
from uvicorn import Config, Server
from AutoGGUF import AutoGGUF
from Localizations import AUTOGGUF_VERSION
app = FastAPI(
title="AutoGGUF",
description="API for AutoGGUF - automatically quant GGUF models",
version=AUTOGGUF_VERSION,
license_info={
"name": "Apache 2.0",
"url": "https://raw.githubusercontent.com/leafspark/AutoGGUF/main/LICENSE",
},
)
# Global variable to hold the window reference
window = None
class ModelType(str, Enum):
single = "single"
sharded = "sharded"
class Model(BaseModel):
name: str = Field(..., description="Name of the model")
type: str = Field(..., description="Type of the model")
path: str = Field(..., description="Path to the model file")
size: Optional[int] = Field(None, description="Size of the model in bytes")
class Config:
json_schema_extra = {
"example": {
"name": "Llama-3.1-8B-Instruct.fp16.gguf",
"type": "single",
"path": "Llama-3.1-8B-Instruct.fp16.gguf",
"size": 13000000000,
}
}
class Task(BaseModel):
# id: str = Field(..., description="Unique identifier for the task")
status: str = Field(..., description="Current status of the task")
progress: float = Field(..., description="Progress of the task as a percentage")
class Config:
json_schema_extra = {
"example": {"id": "task_123", "status": "running", "progress": 75.5}
}
class Backend(BaseModel):
name: str = Field(..., description="Name of the backend")
path: str = Field(..., description="Path to the backend executable")
class Plugin(BaseModel):
name: str = Field(..., description="Name of the plugin")
version: str = Field(..., description="Version of the plugin")
description: str = Field(..., description="Description of the plugin")
author: str = Field(..., description="Author of the plugin")
# API Key configuration
API_KEY_NAME = "Authorization"
api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False)
def get_api_key(
api_key_header: str = Security(api_key_header),
) -> Optional[str]:
api_key_env = os.getenv("AUTOGGUF_SERVER_API_KEY")
if not api_key_env:
return None # No API key restriction if not set
api_keys = [
key.strip() for key in api_key_env.split(",") if key.strip()
] # Split by comma and strip whitespace
if api_key_header and api_key_header.startswith("Bearer "):
api_key = api_key_header[len("Bearer ") :]
if api_key in api_keys:
return api_key
raise HTTPException(status_code=403, detail="Could not validate API key")
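A minimal client-side sketch (hypothetical key) of how the Bearer scheme above is meant to be used, assuming the server was started with AUTOGGUF_SERVER=enabled on the default port 7001 and that AUTOGGUF_SERVER_API_KEY contains the key:
import requests
headers = {"Authorization": "Bearer my-secret-key"}  # hypothetical key
resp = requests.get("http://127.0.0.1:7001/v1/models", headers=headers)
print(resp.json())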
@app.get(
"/v1/models",
response_model=List[Model],
tags=["Models"],
dependencies=[Depends(get_api_key)],
)
async def get_models(
type: Optional[ModelType] = Query(None, description="Filter models by type")
) -> List[Model]:
if window:
models = window.get_models_data()
if type:
models = [m for m in models if m["type"] == type]
return [Model(**m) for m in models]
return []
@app.get(
"/v1/tasks",
response_model=List[Task],
tags=["Tasks"],
dependencies=[Depends(get_api_key)],
)
async def get_tasks() -> List[Task]:
if window:
return window.get_tasks_data()
return []
@app.get("/v1/health", tags=["System"], dependencies=[Depends(get_api_key)])
async def health_check() -> dict:
return {"status": "alive"}
@app.get(
"/v1/backends",
response_model=List[Backend],
tags=["System"],
dependencies=[Depends(get_api_key)],
)
async def get_backends() -> List[Backend]:
backends = []
if window:
for i in range(window.backend_combo.count()):
backends.append(
Backend(
name=window.backend_combo.itemText(i),
path=window.backend_combo.itemData(i),
)
)
return backends
@app.get(
"/v1/plugins",
response_model=List[Plugin],
tags=["System"],
dependencies=[Depends(get_api_key)],
)
async def get_plugins() -> List[Plugin]:
if window:
return [
Plugin(**plugin_data["data"]) for plugin_data in window.plugins.values()
]
return []
def run_uvicorn() -> None:
if os.environ.get("AUTOGGUF_SERVER", "").lower() == "enabled":
config = Config(
app=app,
host="127.0.0.1",
port=int(os.environ.get("AUTOGGUF_SERVER_PORT", 7001)),
log_level="info",
)
server = Server(config)
server.run()
def main() -> None:
global window
qt_app = QApplication(sys.argv)
window = AutoGGUF(sys.argv)
window.show()
# Start Uvicorn in a separate thread after a short delay
timer = QTimer()
timer.singleShot(
100, lambda: threading.Thread(target=run_uvicorn, daemon=True).start()
)
sys.exit(qt_app.exec())
if __name__ == "__main__":
main()

src/presets.py (new file, 118 lines)

@ -0,0 +1,118 @@
import json
from PySide6.QtCore import Qt
from PySide6.QtWidgets import QApplication, QFileDialog, QMessageBox
from Localizations import (
SAVING_PRESET,
SAVE_PRESET,
JSON_FILES,
PRESET_SAVED,
PRESET_SAVED_TO,
LOADING_PRESET,
LOAD_PRESET,
PRESET_LOADED,
PRESET_LOADED_FROM,
)
def save_preset(self) -> None:
self.logger.info(SAVING_PRESET)
preset = {
"quant_types": [item.text() for item in self.quant_type.selectedItems()],
"allow_requantize": self.allow_requantize.isChecked(),
"leave_output_tensor": self.leave_output_tensor.isChecked(),
"pure": self.pure.isChecked(),
"imatrix": self.imatrix.text(),
"include_weights": self.include_weights.text(),
"exclude_weights": self.exclude_weights.text(),
"use_output_tensor_type": self.use_output_tensor_type.isChecked(),
"output_tensor_type": self.output_tensor_type.currentText(),
"use_token_embedding_type": self.use_token_embedding_type.isChecked(),
"token_embedding_type": self.token_embedding_type.currentText(),
"keep_split": self.keep_split.isChecked(),
"kv_overrides": [
entry.get_raw_override_string() for entry in self.kv_override_entries
],
"extra_arguments": self.extra_arguments.text(),
}
if not QApplication.keyboardModifiers() & Qt.ShiftModifier:
file_name, _ = QFileDialog.getSaveFileName(self, SAVE_PRESET, "", JSON_FILES)
if file_name:
with open(file_name, "w") as f:
json.dump(preset, f, indent=4)
QMessageBox.information(
self, PRESET_SAVED, PRESET_SAVED_TO.format(file_name)
)
self.logger.info(PRESET_SAVED_TO.format(file_name))
else:
clipboard = QApplication.clipboard()
preset_str = json.dumps(preset, indent=1)
clipboard.setText(preset_str)
QMessageBox.information(self, PRESET_SAVED, "Preset copied to clipboard")
self.logger.info("Preset copied to clipboard")
def load_preset(self) -> None:
self.logger.info(LOADING_PRESET)
try:
if QApplication.keyboardModifiers() & Qt.ShiftModifier:
clipboard = QApplication.clipboard()
preset = json.loads(clipboard.text())
source = "clipboard"
else:
file_name, _ = QFileDialog.getOpenFileName(
self, LOAD_PRESET, "", JSON_FILES
)
if not file_name:
return
with open(file_name, "r") as f:
preset = json.load(f)
source = file_name
self.quant_type.clearSelection()
for quant_type in preset.get("quant_types", []):
items = self.quant_type.findItems(quant_type, Qt.MatchExactly)
if items:
items[0].setSelected(True)
self.allow_requantize.setChecked(preset.get("allow_requantize", False))
self.leave_output_tensor.setChecked(preset.get("leave_output_tensor", False))
self.pure.setChecked(preset.get("pure", False))
self.imatrix.setText(preset.get("imatrix", ""))
self.include_weights.setText(preset.get("include_weights", ""))
self.exclude_weights.setText(preset.get("exclude_weights", ""))
self.use_output_tensor_type.setChecked(
preset.get("use_output_tensor_type", False)
)
self.output_tensor_type.setCurrentText(preset.get("output_tensor_type", ""))
self.use_token_embedding_type.setChecked(
preset.get("use_token_embedding_type", False)
)
self.token_embedding_type.setCurrentText(preset.get("token_embedding_type", ""))
self.keep_split.setChecked(preset.get("keep_split", False))
self.extra_arguments.setText(preset.get("extra_arguments", ""))
# Clear existing KV overrides and add new ones
for entry in self.kv_override_entries:
self.remove_kv_override(entry)
for override in preset.get("kv_overrides", []):
self.add_kv_override(override)
QMessageBox.information(
self,
PRESET_LOADED,
PRESET_LOADED_FROM.format(
source
if not QApplication.keyboardModifiers() & Qt.ShiftModifier
else "clipboard"
),
)
self.logger.info(PRESET_LOADED_FROM.format(source))
except json.JSONDecodeError:
QMessageBox.critical(self, "Error", "Invalid JSON in clipboard")
self.logger.error("Failed to parse JSON from clipboard")
except Exception as e:
QMessageBox.critical(self, "Error", f"Failed to load preset: {str(e)}")
self.logger.error(f"Failed to load preset: {str(e)}")


@ -0,0 +1,559 @@
import copy
import gc
import re
import sys
from typing import List
from typing import Optional, Tuple
import torch
import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
# https://github.com/neuralmagic/AutoFP8
class BaseQuantizeConfig:
"""Configuration for model quantization.
Args:
quant_method: Type/precision of quantization method to use.
At the moment, this is just "fp8" which specifically means
the fp8_e4m3 format in pytorch.
activation_scheme: Choice of either "dynamic" or "static" quantization
of activations. If "static", then calibration samples are required
during quantization to produce accurate per-tensor scales for
activations of Linear modules.
ignore_patterns: List of patterns used to ignore layers. If a string
starts with "re:", then everything afterward is used as python
regex style matching i.e. re.search(), for each Linear layer.
By default, "re:.*lm_head" is included to ignore the embedding
Linear layer usually at the end of decoder LLMs
kv_cache_quant_targets: Tuple of Linear module names to target for
calibration of the output scales for KV cache quantization.
Usually, these should be `("k_proj", "v_proj")`.
"""
def __init__(
self,
quant_method: str = "fp8",
activation_scheme: str = "static",
ignore_patterns: List[str] = ["re:.*lm_head"],
kv_cache_quant_targets: Optional[Tuple[str]] = None,
):
if quant_method != "fp8":
raise ValueError("Only FP8 quantization is supported.")
if activation_scheme not in ["static", "dynamic"]:
raise ValueError(
"Invalid activation_scheme. Choose either 'static' or 'dynamic'."
)
self.quant_method = quant_method
self.activation_scheme = activation_scheme
self.ignore_patterns = ignore_patterns
self.kv_cache_quant_targets = kv_cache_quant_targets
self.ignored_layers = []
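A short construction sketch for the config above; the values mirror the docstring's defaults, and the KV-cache targets are the usual attention projections:
# Static FP8 quantization with KV-cache scales captured from k_proj/v_proj outputs
quantize_config = BaseQuantizeConfig(
    quant_method="fp8",
    activation_scheme="static",
    ignore_patterns=["re:.*lm_head"],
    kv_cache_quant_targets=("k_proj", "v_proj"),
)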
# Class responsible for quantizing weights
class FP8DynamicLinear(torch.nn.Module):
def __init__(
self,
weight: torch.Tensor,
weight_scale: torch.Tensor,
bias: torch.nn.Parameter,
):
super().__init__()
self.weight = torch.nn.Parameter(weight, requires_grad=False)
self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
self.bias = bias
def forward(self, x):
qinput, x_scale = per_tensor_quantize(x)
output = fp8_gemm(
A=qinput,
A_scale=x_scale,
B=self.weight,
B_scale=self.weight_scale,
bias=self.bias,
out_dtype=x.dtype,
)
return output
# Module responsible for taking already quantized weights, and recording input scales (and possibly output scales)
# using an activation observer
class FP8StaticLinearQuantizer(torch.nn.Module):
def __init__(
self,
weight: torch.Tensor,
weight_scale: torch.Tensor,
bias: torch.nn.Parameter,
quantize_output: bool = False,
):
super().__init__()
self.weight = torch.nn.Parameter(weight, requires_grad=False)
self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
self.bias = bias
self.input_scale = None
self.output_scale = None
self.quantize_output = quantize_output
def forward(self, x):
qinput, x_input_scale = per_tensor_quantize(x)
if self.input_scale is None:
self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False)
elif x_input_scale > self.input_scale:
self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False)
output = fp8_gemm(
A=qinput,
A_scale=self.input_scale,
B=self.weight,
B_scale=self.weight_scale,
bias=self.bias,
out_dtype=x.dtype,
)
# Optionally, quantize output and record scale
if self.quantize_output:
qoutput, output_scale = per_tensor_quantize(output)
if self.output_scale is None:
self.output_scale = torch.nn.Parameter(
output_scale, requires_grad=False
)
elif output_scale > self.output_scale:
self.output_scale = torch.nn.Parameter(
output_scale, requires_grad=False
)
output = qoutput.to(output.dtype) * output_scale
return output
# Module responsible for representing the final checkpoint representation
class FP8StaticLinear(torch.nn.Module):
def __init__(
self,
weight: torch.nn.Parameter,
weight_scale: torch.nn.Parameter,
bias: torch.nn.Parameter,
input_scale: torch.nn.Parameter,
output_scale: Optional[torch.nn.Parameter] = None,
):
super().__init__()
self.weight = weight
self.weight_scale = weight_scale
self.bias = bias
self.input_scale = input_scale
self.output_scale = output_scale
def forward(self, x):
qinput = static_per_tensor_quantize(x, self.input_scale)
output = fp8_gemm(
A=qinput,
A_scale=self.input_scale,
B=self.weight,
B_scale=self.weight_scale,
bias=self.bias,
out_dtype=x.dtype,
)
if self.output_scale:
qoutput = static_per_tensor_quantize(output, self.output_scale)
output = qoutput.to(output.dtype) * self.output_scale
return output
class AutoFP8ForCausalLM:
def __init__(
self,
model: AutoModelForCausalLM,
quantize_config: BaseQuantizeConfig,
):
self.model = model
self.model_type = self.model.config.model_type
self.config = self.model.config
# Gather the Linear module names that we want to ignore
quantize_config.ignored_layers = get_layers_to_ignore(
self.model, quantize_config.ignore_patterns
)
if quantize_config.kv_cache_quant_targets:
kv_cache_quant_layers = get_kv_cache_quant_layers(
self.model, quantize_config.kv_cache_quant_targets
)
if len(kv_cache_quant_layers) == 0:
raise ValueError(
f"Could not find any kv cache layers using kv_cache_quant_targets={quantize_config.kv_cache_quant_targets}, please fix your argument."
)
quantize_config.kv_cache_quant_layers = kv_cache_quant_layers
self.quantize_config = quantize_config
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path: str,
quantize_config: BaseQuantizeConfig,
**model_init_kwargs,
):
"""Load the un-quantized pretrained model"""
def skip(*args, **kwargs):
pass
torch.nn.init.kaiming_uniform_ = skip
torch.nn.init.uniform_ = skip
torch.nn.init.normal_ = skip
# Parameters related to loading from Hugging Face Hub
cache_dir = model_init_kwargs.pop("cache_dir", None)
force_download = model_init_kwargs.pop("force_download", False)
resume_download = model_init_kwargs.pop("resume_download", False)
proxies = model_init_kwargs.pop("proxies", None)
local_files_only = model_init_kwargs.pop("local_files_only", False)
use_auth_token = model_init_kwargs.pop("use_auth_token", None)
revision = model_init_kwargs.pop("revision", None)
subfolder = model_init_kwargs.pop("subfolder", "")
commit_hash = model_init_kwargs.pop("_commit_hash", None)
cached_file_kwargs = {
"cache_dir": cache_dir,
"force_download": force_download,
"proxies": proxies,
"resume_download": resume_download,
"local_files_only": local_files_only,
"use_auth_token": use_auth_token,
"revision": revision,
"subfolder": subfolder,
"_commit_hash": commit_hash,
}
torch.cuda.empty_cache()
# Important defaults
if "torch_dtype" not in model_init_kwargs:
model_init_kwargs["torch_dtype"] = "auto"
if "device_map" not in model_init_kwargs:
model_init_kwargs["device_map"] = "auto"
merged_kwargs = {**model_init_kwargs, **cached_file_kwargs}
print("Loading model with the following kwargs:", merged_kwargs)
model = AutoModelForCausalLM.from_pretrained(
pretrained_model_name_or_path, **merged_kwargs
)
model_config = model.config.to_dict()
seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions"]
if any(k in model_config for k in seq_len_keys):
for key in seq_len_keys:
if key in model_config:
model.seqlen = model_config[key]
break
else:
print("Can't get model's sequence length, setting to 2048.")
model.seqlen = 2048
model.eval()
return cls(model, quantize_config)
def quantize(self, calibration_tokens: Optional[torch.Tensor] = None):
# Always quantize the weights as they do not require calibration data
quantize_weights(self.model, self.quantize_config)
if self.quantize_config.activation_scheme == "static":
assert (
calibration_tokens is not None
), "Calibration tokens required for activation quantization"
def _prepare_calibration_data(calibration_tokens):
if hasattr(calibration_tokens, "input_ids"):
return calibration_tokens.input_ids
return calibration_tokens
quantize_activations(
self.model,
self.quantize_config,
_prepare_calibration_data(calibration_tokens),
)
def save_quantized(self, save_dir):
save_quantized_model(
self.model,
quant_config=self.quantize_config,
save_dir=save_dir,
)
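An end-to-end sketch of the flow above (model id, calibration text, and output directory are placeholders; static activation quantization needs tokenized calibration data):
from transformers import AutoTokenizer
model_id = "facebook/opt-125m"  # hypothetical small checkpoint
cfg = BaseQuantizeConfig(activation_scheme="static")
tokenizer = AutoTokenizer.from_pretrained(model_id)
calibration_tokens = tokenizer(
    ["AutoGGUF calibration sample."] * 4, return_tensors="pt"
).input_ids
model = AutoFP8ForCausalLM.from_pretrained(model_id, cfg)
model.quantize(calibration_tokens)      # quantize weights, then calibrate activation scales
model.save_quantized("./opt-125m-fp8")  # hypothetical output directory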
def cleanup_memory():
gc.collect()
torch.cuda.empty_cache()
def per_tensor_quantize(tensor: torch.Tensor) -> Tuple[torch.Tensor, float]:
"""Quantize a tensor using per-tensor static scaling factor.
Args:
tensor: The input tensor.
"""
finfo = torch.finfo(torch.float8_e4m3fn)
# Calculate the scale as dtype max divided by absmax.
# Since .abs() creates a new tensor, we use aminmax to get
# the min and max first and then calculate the absmax.
if tensor.numel() == 0:
# Deal with empty tensors (triggered by empty MoE experts)
min_val, max_val = (
torch.tensor(-16.0, dtype=tensor.dtype),
torch.tensor(16.0, dtype=tensor.dtype),
)
else:
min_val, max_val = tensor.aminmax()
amax = torch.maximum(min_val.abs(), max_val.abs())
scale = finfo.max / amax.clamp(min=1e-12)
# Scale and clamp the tensor to bring it to
# the representative range of float8 data type
# (as default cast is unsaturated)
qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max)
# Return both float8 data and the inverse scale (as float),
# as both required as inputs to torch._scaled_mm
qweight = qweight.to(torch.float8_e4m3fn)
scale = scale.float().reciprocal()
return qweight, scale
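A tiny numerical check of the scaling above (illustrative values): the scale maps the tensor's absmax onto the FP8 E4M3 maximum (448), and the returned reciprocal scale dequantizes it back.
import torch
t = torch.tensor([0.5, -2.0, 4.0], dtype=torch.float32)
q, inv_scale = per_tensor_quantize(t)
# finfo(torch.float8_e4m3fn).max == 448 and absmax == 4.0, so scale == 112 and inv_scale == 1/112
print(q.to(torch.float32) * inv_scale)  # approximately [0.5, -2.0, 4.0]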
def static_per_tensor_quantize(tensor: torch.Tensor, inv_scale: float) -> torch.Tensor:
finfo = torch.finfo(torch.float8_e4m3fn)
qweight = (tensor / inv_scale).clamp(min=finfo.min, max=finfo.max)
return qweight.to(torch.float8_e4m3fn)
def fp8_gemm(A, A_scale, B, B_scale, bias, out_dtype):
if A.numel() == 0:
# Deal with empty tensors (triggered by empty MoE experts)
return torch.empty(size=(0, B.shape[0]), dtype=out_dtype, device=A.device)
# TODO: Disable native fp8 gemm for now, always just dequantize
# native_fp8_support = (
# torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 9)
# )
native_fp8_support = False
if native_fp8_support:
need_reshape = A.dim() == 3
if need_reshape:
batch_size = A.shape[0]
A_input = A.reshape(-1, A.shape[-1])
else:
batch_size = None
A_input = A
output, _ = torch._scaled_mm(
A_input,
B.t(),
out_dtype=out_dtype,
scale_a=A_scale,
scale_b=B_scale,
bias=bias,
)
if need_reshape:
output = output.reshape(
batch_size, output.shape[0] // batch_size, output.shape[1]
)
else:
output = torch.nn.functional.linear(
A.to(out_dtype) * A_scale,
B.to(out_dtype) * B_scale.to(out_dtype),
bias=bias,
)
return output
def replace_module(model: AutoModelForCausalLM, name: str, new_module: torch.nn.Module):
if "." in name:
parent_name = name.rsplit(".", 1)[0]
child_name = name[len(parent_name) + 1 :]
parent = model.get_submodule(parent_name)
else:
parent_name = ""
parent = model
child_name = name
setattr(parent, child_name, new_module)
def quantize_weights(
model: AutoModelForCausalLM,
quantize_config: BaseQuantizeConfig,
):
named_modules = list(model.named_modules())
for name, linear in tqdm.tqdm(named_modules, desc="Quantizing weights"):
if (
not isinstance(linear, torch.nn.Linear)
or name in quantize_config.ignored_layers
):
continue
quant_weight, weight_scale = per_tensor_quantize(linear.weight)
bias = copy.deepcopy(linear.bias) if linear.bias is not None else None
quant_linear = FP8DynamicLinear(
weight=quant_weight, weight_scale=weight_scale, bias=bias
)
replace_module(model, name, quant_linear)
del linear.weight
del linear.bias
del linear
cleanup_memory()
def quantize_activations(
model: AutoModelForCausalLM,
quantize_config: BaseQuantizeConfig,
calibration_tokens,
):
# Replace weight quantizer with a dynamic activation quantizer observer
for name, dynamic_quant_linear in model.named_modules():
if (
not isinstance(dynamic_quant_linear, FP8DynamicLinear)
or name in quantize_config.ignored_layers
):
continue
quantizer = FP8StaticLinearQuantizer(
weight=dynamic_quant_linear.weight,
weight_scale=dynamic_quant_linear.weight_scale,
bias=dynamic_quant_linear.bias,
quantize_output=(
hasattr(quantize_config, "kv_cache_quant_layers")
and name in quantize_config.kv_cache_quant_layers
),
)
replace_module(model, name, quantizer)
del dynamic_quant_linear
cleanup_memory()
# Pass through calibration data to measure activation scales
with torch.inference_mode():
with tqdm.tqdm(
total=calibration_tokens.shape[0], desc="Calibrating activation scales"
) as pbar:
for row_idx in range(calibration_tokens.shape[0]):
model(calibration_tokens[row_idx].reshape(1, -1))
cleanup_memory()
pbar.update(1)
# Replace dynamic quantizer observer with StaticLinear for export
for name, quantizer in model.named_modules():
if (
not isinstance(quantizer, FP8StaticLinearQuantizer)
or name in quantize_config.ignored_layers
):
continue
static_proj = FP8StaticLinear(
weight=quantizer.weight,
weight_scale=quantizer.weight_scale,
bias=quantizer.bias,
input_scale=quantizer.input_scale,
output_scale=quantizer.output_scale,
)
replace_module(model, name, static_proj)
del quantizer
cleanup_memory()
# Post-process step for kv cache scales to take the k/v module
# `output_scale` parameters, and store them in the parent attention
# module as `k_scale` and `v_scale`
if hasattr(quantize_config, "kv_cache_quant_layers"):
# Assumes that list is ordered such that [layer0.k_proj, layer0.v_proj, layer1.k_proj, layer1.v_proj, ...]
# so we make a list of tuples [(layer0.k_proj, layer0.v_proj), (layer1.k_proj, layer1.v_proj), ...]
kv_proj_pairs = zip(*[iter(quantize_config.kv_cache_quant_layers)] * 2)
for k_proj_name, v_proj_name in kv_proj_pairs:
parent_module_name = ".".join(k_proj_name.split(".")[:-1])
assert parent_module_name == ".".join(v_proj_name.split(".")[:-1])
parent_module = dict(model.named_modules())[parent_module_name]
k_proj = dict(model.named_modules())[k_proj_name]
v_proj = dict(model.named_modules())[v_proj_name]
parent_module.k_scale = torch.nn.Parameter(
k_proj.output_scale, requires_grad=False
)
parent_module.v_scale = torch.nn.Parameter(
v_proj.output_scale, requires_grad=False
)
# Remove output_scale from k_proj and v_proj
k_proj.output_scale = None
v_proj.output_scale = None
cleanup_memory()
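# Illustrative sketch: the zip(*[iter(...)] * 2) idiom above pairs consecutive
# k_proj/v_proj entries from the ordered kv_cache_quant_layers list. The layer names
# below are made up for demonstration.
def _demo_kv_pairing() -> None:
    layers = [
        "model.layers.0.self_attn.k_proj",
        "model.layers.0.self_attn.v_proj",
        "model.layers.1.self_attn.k_proj",
        "model.layers.1.self_attn.v_proj",
    ]
    pairs = list(zip(*[iter(layers)] * 2))
    assert pairs == [
        ("model.layers.0.self_attn.k_proj", "model.layers.0.self_attn.v_proj"),
        ("model.layers.1.self_attn.k_proj", "model.layers.1.self_attn.v_proj"),
    ]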
def save_quantized_model(
model: AutoModelForCausalLM,
quant_config: BaseQuantizeConfig,
save_dir: str,
):
print(model)
print(f"Saving the model to {save_dir}")
static_q_dict = {
"quantization_config": {
"quant_method": "fp8",
"activation_scheme": quant_config.activation_scheme,
"ignored_layers": quant_config.ignored_layers,
}
}
if hasattr(quant_config, "kv_cache_quant_layers"):
static_q_dict["quantization_config"]["kv_cache_scheme"] = "static"
model.config.update(static_q_dict)
model.save_pretrained(save_dir)
tokenizer = AutoTokenizer.from_pretrained(model.config._name_or_path)
tokenizer.save_pretrained(save_dir)
def get_layers_to_ignore(model, ignore_patterns) -> List[str]:
ignored_layers = set()
for name, linear in model.named_modules():
if not isinstance(linear, torch.nn.Linear):
continue
for ignore_pattern in ignore_patterns:
regex_prefix = "re:"
if ignore_pattern.startswith(regex_prefix):
# check if name matches regex and add to set if true
regex_pattern = ignore_pattern[len(regex_prefix) :]
if re.search(regex_pattern, name):
ignored_layers.add(name)
else:
# else, exact match
if ignore_pattern == name:
ignored_layers.add(name)
return list(ignored_layers)
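# Illustrative sketch: ignore patterns are either exact module names or regular
# expressions prefixed with "re:". The toy model and pattern below are arbitrary.
def _demo_get_layers_to_ignore() -> None:
    import torch

    toy = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.Linear(4, 2))
    # Sequential names its children "0" and "1"; ignore the second Linear by regex.
    assert get_layers_to_ignore(toy, ["re:^1$"]) == ["1"]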
def get_kv_cache_quant_layers(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
kv_cache_quant_layers = []
for name, linear in model.named_modules():
if not isinstance(linear, torch.nn.Linear):
continue
for output_quant_target in kv_cache_quant_targets:
if name.endswith(output_quant_target):
kv_cache_quant_layers.append(name)
return kv_cache_quant_layers
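# Illustrative sketch: kv-cache quantization targets are matched by module-name
# suffix. The module names below are made up for demonstration.
def _demo_get_kv_cache_quant_layers() -> None:
    import torch

    toy = torch.nn.ModuleDict(
        {"k_proj": torch.nn.Linear(4, 4), "v_proj": torch.nn.Linear(4, 4)}
    )
    assert get_kv_cache_quant_layers(toy, ("k_proj", "v_proj")) == ["k_proj", "v_proj"]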
def quantize_to_fp8_dynamic(input_model_dir: str, output_model_dir: str) -> None:
    # Define quantization config with dynamic activation scales
    quantize_config = BaseQuantizeConfig(
        quant_method="fp8", activation_scheme="dynamic"
    )
    # Load the model, quantize, and save checkpoint
    model = AutoFP8ForCausalLM.from_pretrained(input_model_dir, quantize_config)
    # No calibration examples are needed for dynamic activation quantization
    model.quantize([])
    model.save_quantized(output_model_dir)


if __name__ == "__main__":
    # sys.argv[0] is the script itself; the model directories start at sys.argv[1]
    quantize_to_fp8_dynamic(sys.argv[1], sys.argv[2])
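# Example invocation (illustrative; the script filename and model paths are assumed):
#   python quantize_to_fp8_dynamic.py ./input-model-dir ./output-fp8-dir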

src/ui_update.py (new file, 227 lines added)
@@ -0,0 +1,227 @@
from typing import Tuple
import psutil
from PySide6.QtCore import QTimer
from PySide6.QtGui import Qt
from PySide6.QtWidgets import QFileDialog, QLabel
from Localizations import *
from error_handling import show_error
def resize_window(self, larger) -> None:
factor = 1.1 if larger else 1 / 1.1
current_width = self.width()
current_height = self.height()
new_width = int(current_width * factor)
new_height = int(current_height * factor)
self.resize(new_width, new_height)
def reset_size(self) -> None:
self.resize(self.default_width, self.default_height)
def parse_resolution(self) -> Tuple[int, int]:
res = os.environ.get("AUTOGGUF_RESOLUTION", "1650x1100")
try:
width, height = map(int, res.split("x"))
if width <= 0 or height <= 0:
raise ValueError
return width, height
except (ValueError, AttributeError):
return 1650, 1100
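# Illustrative sketch: the window size is taken from the optional AUTOGGUF_RESOLUTION
# variable in "<width>x<height>" form, falling back to 1650x1100 on bad input. The
# helper name is arbitrary; `self` is unused by parse_resolution, so None suffices.
def _demo_parse_resolution_fallback() -> None:
    import os

    os.environ["AUTOGGUF_RESOLUTION"] = "not-a-resolution"
    assert parse_resolution(None) == (1650, 1100)
    del os.environ["AUTOGGUF_RESOLUTION"]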
def browse_base_model(self) -> None:
    self.logger.info(BROWSING_FOR_BASE_MODEL_FOLDER)
base_model_folder = QFileDialog.getExistingDirectory(self, SELECT_BASE_MODEL_FOLDER)
if base_model_folder:
self.base_model_path.setText(os.path.abspath(base_model_folder))
def browse_hf_model_input(self) -> None:
self.logger.info(BROWSE_FOR_HF_MODEL_DIRECTORY)
model_dir = QFileDialog.getExistingDirectory(self, SELECT_HF_MODEL_DIRECTORY)
if model_dir:
self.hf_model_input.setText(os.path.abspath(model_dir))
def browse_hf_outfile(self) -> None:
self.logger.info(BROWSE_FOR_HF_TO_GGUF_OUTPUT)
outfile, _ = QFileDialog.getSaveFileName(self, SELECT_OUTPUT_FILE, "", GGUF_FILES)
if outfile:
self.hf_outfile.setText(os.path.abspath(outfile))
def browse_imatrix_datafile(self) -> None:
self.logger.info(BROWSING_FOR_IMATRIX_DATA_FILE)
datafile, _ = QFileDialog.getOpenFileName(self, SELECT_DATA_FILE, "", ALL_FILES)
if datafile:
self.imatrix_datafile.setText(os.path.abspath(datafile))
def browse_imatrix_model(self) -> None:
self.logger.info(BROWSING_FOR_IMATRIX_MODEL_FILE)
model_file, _ = QFileDialog.getOpenFileName(self, SELECT_MODEL_FILE, "", GGUF_FILES)
if model_file:
self.imatrix_model.setText(os.path.abspath(model_file))
def browse_imatrix_output(self) -> None:
self.logger.info(BROWSING_FOR_IMATRIX_OUTPUT_FILE)
output_file, _ = QFileDialog.getSaveFileName(
self, SELECT_OUTPUT_FILE, "", DAT_FILES
)
if output_file:
self.imatrix_output.setText(os.path.abspath(output_file))
def create_label(self, text, tooltip) -> QLabel:
label = QLabel(text)
label.setToolTip(tooltip)
return label
def toggle_gpu_offload_auto(self, state) -> None:
is_auto = state == Qt.CheckState.Checked
self.gpu_offload_slider.setEnabled(not is_auto)
self.gpu_offload_spinbox.setEnabled(not is_auto)
def update_model_info(logger, model_info) -> None:
    logger.debug(UPDATING_MODEL_INFO.format(model_info))
def update_system_info(self) -> None:
ram = psutil.virtual_memory()
cpu = psutil.cpu_percent()
# Smooth transition for RAM bar
animate_bar(self, self.ram_bar, ram.percent)
# Smooth transition for CPU bar
animate_bar(self, self.cpu_bar, cpu)
self.ram_bar.setFormat(
RAM_USAGE_FORMAT.format(
ram.percent, ram.used // 1024 // 1024, ram.total // 1024 // 1024
)
)
self.cpu_label.setText(CPU_USAGE_FORMAT.format(cpu))
# Collect CPU and RAM usage data
self.cpu_data.append(cpu)
self.ram_data.append(ram.percent)
if len(self.cpu_data) > 60:
self.cpu_data.pop(0)
self.ram_data.pop(0)
def animate_bar(self, bar, target_value) -> None:
current_value = bar.value()
difference = target_value - current_value
if abs(difference) <= 1: # Avoid animation for small changes
bar.setValue(target_value)
return
step = 1 if difference > 0 else -1 # Increment or decrement based on difference
timer = QTimer(self)
timer.timeout.connect(lambda: _animate_step(bar, target_value, step, timer))
timer.start(10) # Adjust the interval for animation speed
def _animate_step(bar, target_value, step, timer) -> None:
current_value = bar.value()
new_value = current_value + step
if (step > 0 and new_value > target_value) or (
step < 0 and new_value < target_value
):
bar.setValue(target_value)
timer.stop()
else:
bar.setValue(new_value)
def update_download_progress(self, progress) -> None:
self.download_progress.setValue(progress)
def update_cuda_backends(self) -> None:
self.logger.debug(UPDATING_CUDA_BACKENDS)
self.backend_combo_cuda.clear()
llama_bin = os.path.abspath("llama_bin")
if os.path.exists(llama_bin):
for item in os.listdir(llama_bin):
item_path = os.path.join(llama_bin, item)
if os.path.isdir(item_path) and "cudart-llama" not in item.lower():
if (
"cu1" in item.lower() or "cuda-1" in item.lower()
): # Only include CUDA-capable backends
self.backend_combo_cuda.addItem(item, userData=item_path)
if self.backend_combo_cuda.count() == 0:
self.backend_combo_cuda.addItem(NO_SUITABLE_CUDA_BACKENDS)
self.backend_combo_cuda.setEnabled(False)
else:
self.backend_combo_cuda.setEnabled(True)
def update_threads_spinbox(self, value) -> None:
self.threads_spinbox.setValue(value)
def update_threads_slider(self, value) -> None:
self.threads_slider.setValue(value)
def update_gpu_offload_spinbox(self, value) -> None:
self.gpu_offload_spinbox.setValue(value)
def update_gpu_offload_slider(self, value) -> None:
self.gpu_offload_slider.setValue(value)
def update_cuda_option(self) -> None:
self.logger.debug(UPDATING_CUDA_OPTIONS)
asset = self.asset_combo.currentData()
# Handle the case where asset is None
if asset is None:
self.logger.warning(NO_ASSET_SELECTED_FOR_CUDA_CHECK)
self.cuda_extract_checkbox.setVisible(False)
self.cuda_backend_label.setVisible(False)
self.backend_combo_cuda.setVisible(False)
return # Exit the function early
is_cuda = asset and "cudart" in asset["name"].lower()
self.cuda_extract_checkbox.setVisible(is_cuda)
self.cuda_backend_label.setVisible(is_cuda)
self.backend_combo_cuda.setVisible(is_cuda)
if is_cuda:
self.update_cuda_backends()
def update_assets(self) -> None:
self.logger.debug(UPDATING_ASSET_LIST)
self.asset_combo.clear()
release = self.release_combo.currentData()
if release:
if "assets" in release:
for asset in release["assets"]:
self.asset_combo.addItem(asset["name"], userData=asset)
else:
show_error(
self.logger, NO_ASSETS_FOUND_FOR_RELEASE.format(release["tag_name"])
)
self.update_cuda_option()
def update_base_model_visibility(self, index) -> None:
is_gguf = self.lora_output_type_combo.itemText(index) == "GGUF"
self.base_model_wrapper.setVisible(is_gguf)

src/utils.py (new file, 219 lines added)
@@ -0,0 +1,219 @@
from typing import Any, Union
import urllib.request
import urllib.error
import json
import ssl
import certifi
from PySide6.QtCore import Qt
from PySide6.QtWidgets import QFileDialog, QInputDialog, QMenu
from DownloadThread import DownloadThread
from Localizations import *
from error_handling import show_error
from globals import ensure_directory
from KVOverrideEntry import KVOverrideEntry
def show_model_context_menu(self, position):
item = self.model_tree.itemAt(position)
if item:
# Child of a sharded model or top-level item without children
if item.parent() is not None or item.childCount() == 0:
menu = QMenu()
rename_action = menu.addAction(RENAME)
delete_action = menu.addAction(DELETE)
action = menu.exec(self.model_tree.viewport().mapToGlobal(position))
if action == rename_action:
self.rename_model(item)
elif action == delete_action:
self.delete_model(item)
def rename_model(self, item):
old_name = item.text(0)
new_name, ok = QInputDialog.getText(self, RENAME, f"New name for {old_name}:")
if ok and new_name:
old_path = os.path.join(self.models_input.text(), old_name)
new_path = os.path.join(self.models_input.text(), new_name)
try:
os.rename(old_path, new_path)
item.setText(0, new_name)
self.logger.info(MODEL_RENAMED_SUCCESSFULLY.format(old_name, new_name))
except Exception as e:
show_error(self.logger, f"Error renaming model: {e}")
def add_kv_override(self, override_string=None) -> None:
entry = KVOverrideEntry()
entry.deleted.connect(self.remove_kv_override)
    if override_string:
        # Override strings use the "key=type:value" form; split only on the first
        # "=" and ":" so values containing those characters are preserved.
        key, value = override_string.split("=", 1)
        type_, val = value.split(":", 1)
        entry.key_input.setText(key)
        entry.type_combo.setCurrentText(type_)
        entry.value_input.setText(val)
self.kv_override_layout.addWidget(entry)
self.kv_override_entries.append(entry)
def remove_kv_override(self, entry) -> None:
self.kv_override_layout.removeWidget(entry)
self.kv_override_entries.remove(entry)
entry.deleteLater()
def get_models_data(self) -> list[dict[str, Union[str, Any]]]:
models = []
root = self.model_tree.invisibleRootItem()
child_count = root.childCount()
for i in range(child_count):
item = root.child(i)
model_name = item.text(0)
model_type = "sharded" if "sharded" in model_name.lower() else "single"
model_path = item.data(0, Qt.ItemDataRole.UserRole)
models.append({"name": model_name, "type": model_type, "path": model_path})
return models
def get_tasks_data(self) -> list[dict[str, Union[int, Any]]]:
tasks = []
for i in range(self.task_list.count()):
item = self.task_list.item(i)
task_widget = self.task_list.itemWidget(item)
if task_widget:
tasks.append(
{
"name": task_widget.task_name,
"status": task_widget.status,
"progress": (
task_widget.progress_bar.value()
if hasattr(task_widget, "progress_bar")
else 0
),
"log_file": task_widget.log_file,
}
)
return tasks
def browse_models(self) -> None:
self.logger.info(BROWSING_FOR_MODELS_DIRECTORY)
models_path = QFileDialog.getExistingDirectory(self, SELECT_MODELS_DIRECTORY)
if models_path:
self.models_input.setText(os.path.abspath(models_path))
ensure_directory(models_path)
self.load_models()
def browse_output(self) -> None:
self.logger.info(BROWSING_FOR_OUTPUT_DIRECTORY)
output_path = QFileDialog.getExistingDirectory(self, SELECT_OUTPUT_DIRECTORY)
if output_path:
self.output_input.setText(os.path.abspath(output_path))
ensure_directory(output_path)
def browse_logs(self) -> None:
self.logger.info(BROWSING_FOR_LOGS_DIRECTORY)
logs_path = QFileDialog.getExistingDirectory(self, SELECT_LOGS_DIRECTORY)
if logs_path:
self.logs_input.setText(os.path.abspath(logs_path))
ensure_directory(logs_path)
def browse_imatrix(self) -> None:
self.logger.info(BROWSING_FOR_IMATRIX_FILE)
imatrix_file, _ = QFileDialog.getOpenFileName(
self, SELECT_IMATRIX_FILE, "", DAT_FILES
)
if imatrix_file:
self.imatrix.setText(os.path.abspath(imatrix_file))
def browse_lora_input(self) -> None:
self.logger.info(BROWSING_FOR_LORA_INPUT_DIRECTORY)
lora_input_path = QFileDialog.getExistingDirectory(
self, SELECT_LORA_INPUT_DIRECTORY
)
if lora_input_path:
self.lora_input.setText(os.path.abspath(lora_input_path))
ensure_directory(lora_input_path)
def browse_lora_output(self) -> None:
self.logger.info(BROWSING_FOR_LORA_OUTPUT_FILE)
lora_output_file, _ = QFileDialog.getSaveFileName(
self, SELECT_LORA_OUTPUT_FILE, "", GGUF_AND_BIN_FILES
)
if lora_output_file:
self.lora_output.setText(os.path.abspath(lora_output_file))
def download_llama_cpp(self) -> None:
self.logger.info(STARTING_LLAMACPP_DOWNLOAD)
asset = self.asset_combo.currentData()
if not asset:
show_error(self.logger, NO_ASSET_SELECTED)
return
llama_bin = os.path.abspath("llama_bin")
os.makedirs(llama_bin, exist_ok=True)
save_path = os.path.join(llama_bin, asset["name"])
self.download_thread = DownloadThread(asset["browser_download_url"], save_path)
self.download_thread.progress_signal.connect(self.update_download_progress)
self.download_thread.finished_signal.connect(self.download_finished)
self.download_thread.error_signal.connect(self.download_error)
self.download_thread.start()
self.download_button.setEnabled(False)
self.download_progress.setValue(0)
def get_repo_from_env() -> tuple[str, str]:
repo = os.getenv("AUTOGGUF_BACKEND_REPO", "ggerganov/llama.cpp")
if not repo or "/" not in repo:
raise ValueError(INVALID_REPOSITORY_FORMAT)
owner, repo_name = repo.split("/", 1)
if not all(part.strip() for part in (owner, repo_name)):
raise ValueError(REPO_CANNOT_BE_EMPTY)
return owner, repo_name
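# Illustrative sketch: AUTOGGUF_BACKEND_REPO selects which GitHub repository is
# queried for llama.cpp releases, defaulting to ggerganov/llama.cpp. The repository
# value below is only an example.
def _demo_get_repo_from_env() -> None:
    import os

    os.environ["AUTOGGUF_BACKEND_REPO"] = "ggml-org/llama.cpp"
    assert get_repo_from_env() == ("ggml-org", "llama.cpp")
    del os.environ["AUTOGGUF_BACKEND_REPO"]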
def refresh_releases(self) -> None:
self.logger.info(REFRESHING_LLAMACPP_RELEASES)
try:
owner, repo = get_repo_from_env()
url = f"https://api.github.com/repos/{owner}/{repo}/releases"
# Create SSL context with certifi certificates
ssl_context = ssl.create_default_context(cafile=certifi.where())
# Create request
req = urllib.request.Request(url)
# Make the request
with urllib.request.urlopen(req, context=ssl_context) as response:
if response.status != 200:
raise urllib.error.HTTPError(
url, response.status, "HTTP Error", response.headers, None
)
releases = json.loads(response.read().decode("utf-8"))
self.release_combo.clear()
for release in releases:
self.release_combo.addItem(release["tag_name"], userData=release)
self.release_combo.currentIndexChanged.connect(self.update_assets)
self.update_assets()
except ValueError as e:
show_error(self.logger, f"Invalid repository configuration: {str(e)}")
except (urllib.error.URLError, urllib.error.HTTPError) as e:
show_error(self.logger, ERROR_FETCHING_RELEASES.format(str(e)))