Compare commits


No commits in common. "main" and "v1.7.0" have entirely different histories.
main ... v1.7.0

66 changed files with 3072 additions and 14280 deletions


@ -1,13 +0,0 @@
AUTOGGUF_RESOLUTION=1650x1100
AUTOGGUF_THEME=
AUTOGGUF_CHECK_BACKEND=disabled
AUTOGGUF_CHECK_UPDATE=disabled
AUTOGGUF_SERVER_API_KEY=
AUTOGGUF_MODEL_DIR_NAME=models
AUTOGGUF_OUTPUT_DIR_NAME=quantized_models
AUTOGGUF_RESIZE_FACTOR=1.1
AUTOGGUF_SERVER=enabled
AUTOGGUF_SERVER_PORT=7001
AUTOGGUF_SERVER_API_KEY=
AUTOGGUF_LANGUAGE=en-US
AUTOGGUF_BACKEND_REPO=ggerganov/llama.cpp
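
These variables are read at application startup; the sketch below shows one way such a file could be loaded, assuming `python-dotenv` (which the main branch imports elsewhere in this diff). The fallback values are illustrative, not authoritative defaults.

```python
# Minimal sketch of loading the configuration above with python-dotenv.
# Variable names come from .env.example; the fallbacks here are only examples.
import os

from dotenv import load_dotenv

load_dotenv()  # picks up a .env file in the working directory, if present

resolution = os.getenv("AUTOGGUF_RESOLUTION", "1650x1100")
width, height = (int(v) for v in resolution.split("x"))

server_enabled = os.getenv("AUTOGGUF_SERVER", "disabled").lower() == "enabled"
server_port = int(os.getenv("AUTOGGUF_SERVER_PORT", "7001"))
backend_repo = os.getenv("AUTOGGUF_BACKEND_REPO", "ggerganov/llama.cpp")
```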


@ -12,8 +12,8 @@ jobs:
lint: lint:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v2
- uses: actions/setup-python@v5 - uses: actions/setup-python@v2
- uses: psf/black@stable - uses: psf/black@stable
with: with:
options: "--check --verbose" options: "--check --verbose"


@ -18,17 +18,18 @@ jobs:
matrix: matrix:
os: [windows-latest, ubuntu-latest, macos-latest] os: [windows-latest, ubuntu-latest, macos-latest]
arch: [x64] arch: [x64]
include:
- os: windows-latest
arch: x86
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
outputs:
artifact-names: ${{ steps.set-outputs.outputs.artifact-names }}
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v2
- name: Set up Python - name: Set up Python
uses: actions/setup-python@v5 uses: actions/setup-python@v2
with: with:
python-version: '3.12' python-version: '3.x'
architecture: ${{ matrix.arch }} architecture: ${{ matrix.arch }}
- name: Install dependencies - name: Install dependencies
@ -60,76 +61,23 @@ jobs:
if: matrix.os == 'windows-latest' if: matrix.os == 'windows-latest'
run: | run: |
$distPath = if ("${{ github.event.inputs.build_type }}" -eq "RELEASE") { "build\release\dist" } else { "build\dev\dist" } $distPath = if ("${{ github.event.inputs.build_type }}" -eq "RELEASE") { "build\release\dist" } else { "build\dev\dist" }
New-Item -ItemType Directory -Force -Path "$distPath\src\gguf" New-Item -ItemType Directory -Force -Path "$distPath\src\gguf-py"
Copy-Item -Path "src\gguf\*" -Destination "$distPath\src\gguf" -Recurse Copy-Item -Path "src\gguf-py\*" -Destination "$distPath\src\gguf-py" -Recurse
Copy-Item -Path "src\convert_hf_to_gguf.py" -Destination "$distPath\src"
Copy-Item -Path "src\convert_lora_to_gguf.py" -Destination "$distPath\src" Copy-Item -Path "src\convert_lora_to_gguf.py" -Destination "$distPath\src"
Copy-Item -Path "src\convert_lora_to_ggml.py" -Destination "$distPath\src" Copy-Item -Path "src\convert_lora_to_ggml.py" -Destination "$distPath\src"
Copy-Item -Path "src\quantize_to_fp8_dynamic.py" -Destination "$distPath\src"
Copy-Item -Path ".env.example" -Destination "$distPath\"
- name: Copy additional files (Linux/macOS) - name: Copy additional files (Linux/macOS)
if: matrix.os != 'windows-latest' if: matrix.os != 'windows-latest'
run: | run: |
distPath=$(if [ "${{ github.event.inputs.build_type }}" = "RELEASE" ]; then echo "build/release/dist"; else echo "build/dev/dist"; fi) distPath=$(if [ "${{ github.event.inputs.build_type }}" = "RELEASE" ]; then echo "build/release/dist"; else echo "build/dev/dist"; fi)
mkdir -p $distPath/src/gguf mkdir -p $distPath/src/gguf-py
cp -R src/gguf/* $distPath/src/gguf/ cp -R src/gguf-py/* $distPath/src/gguf-py/
cp src/convert_hf_to_gguf.py $distPath/src/
cp src/convert_lora_to_gguf.py $distPath/src/ cp src/convert_lora_to_gguf.py $distPath/src/
cp src/convert_lora_to_ggml.py $distPath/src/ cp src/convert_lora_to_ggml.py $distPath/src/
cp src/quantize_to_fp8_dynamic.py $distPath/src/
cp .env.example $distPath/
- name: Set outputs for artifact name
id: set-outputs
run: echo "artifact-name=AutoGGUF-${{ matrix.os }}-${{ matrix.arch }}-${{ github.event.inputs.build_type }}-${{ github.sha }}" >> $GITHUB_OUTPUT
- name: Upload Artifact - name: Upload Artifact
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v2
with: with:
name: AutoGGUF-${{ matrix.os }}-${{ matrix.arch }}-${{ github.event.inputs.build_type }}-${{ github.sha }} name: AutoGGUF-${{ matrix.os }}-${{ matrix.arch }}-${{ github.event.inputs.build_type }}-${{ github.sha }}
path: build/${{ github.event.inputs.build_type == 'RELEASE' && 'release' || 'dev' }}/dist path: build/${{ github.event.inputs.build_type == 'RELEASE' && 'release' || 'dev' }}/dist
generate-checksums:
needs: build
runs-on: ubuntu-latest
steps:
- name: Download all artifacts
uses: actions/download-artifact@v4
with:
path: ./artifacts
- name: Generate SHA256 checksums for all artifacts
run: |
cd artifacts
versionHash=$(echo ${{ github.sha }} | cut -c1-7)
echo "# AutoGGUF Build Checksums" > ../checksums.txt
echo "Build: ${{ github.event.inputs.build_type }}" >> ../checksums.txt
echo "Commit: ${{ github.sha }}" >> ../checksums.txt
echo "Date: $(date -u)" >> ../checksums.txt
echo "" >> ../checksums.txt
# Find all artifact directories and generate checksums of their zip equivalents
for artifact_dir in AutoGGUF-*-${{ github.event.inputs.build_type }}-${{ github.sha }}; do
if [ -d "$artifact_dir" ]; then
echo "Processing $artifact_dir..."
cd "$artifact_dir"
# Create a temporary zip to calculate hash (simulating what GitHub creates)
zip -r "../temp_${artifact_dir}.zip" .
cd ..
# Generate SHA256 of the zip file
hash=$(sha256sum "temp_${artifact_dir}.zip" | cut -d' ' -f1)
echo "${hash} ${artifact_dir}.zip" >> ../checksums.txt
# Clean up the temporary zip
rm "temp_${artifact_dir}.zip"
fi
done
- name: Upload checksums
uses: actions/upload-artifact@v4
with:
name: AutoGGUF-${{ github.sha }}-SHA256
path: checksums.txt


@ -77,7 +77,7 @@ jobs:
# For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
# queries: security-extended,security-and-quality # queries: security-extended,security-and-quality
# If the analysis step fails for one of the languages you are analyzing with # If the analyze step fails for one of the languages you are analyzing with
# "We were unable to automatically build your code", modify the matrix above # "We were unable to automatically build your code", modify the matrix above
# to set the build mode to "manual" for that language. Then modify this step # to set the build mode to "manual" for that language. Then modify this step
# to build your code. # to build your code.


@ -14,10 +14,10 @@ jobs:
audit: audit:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v2
- name: Set up Python - name: Set up Python
uses: actions/setup-python@v5 uses: actions/setup-python@v2
with: with:
python-version: '3.x' python-version: '3.x'
@ -52,7 +52,7 @@ jobs:
cat requirements.txt >> detailed_report.txt cat requirements.txt >> detailed_report.txt
- name: Upload audit results - name: Upload audit results
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v2
with: with:
name: pip-audit-report name: pip-audit-report
path: detailed_report.txt path: detailed_report.txt


@ -1,4 +1,5 @@
name: Pylint name: Pylint
on: on:
push: push:
paths: paths:
@ -6,23 +7,23 @@ on:
pull_request: pull_request:
paths: paths:
- '**.py' - '**.py'
jobs: jobs:
build: build:
runs-on: ubuntu-latest runs-on: ubuntu-latest
strategy: strategy:
matrix: matrix:
python-version: ["3.9", "3.10"] python-version: ["3.8", "3.9", "3.10"]
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5 uses: actions/setup-python@v3
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip python -m pip install --upgrade pip
pip install $(grep -v "^torch" requirements.txt | tr '\n' ' ') pip install pylint PyQt6 psutil requests
pip install pylint
- name: Analysing the code with pylint - name: Analysing the code with pylint
run: | run: |
pylint $(git ls-files '*.py') --disable=all --enable=E0001,E0100,E0101,E0102,E0103,E0104,E0105,E0107,E0108,E0110,E0111,E0112,E0113,E0114,E0115,E0116,E0117,E0118,E0202,E0203,E0211,E0213,E0236,E0237,E0238,E0239,E0240,E0241,E0301,E0302,E0303,E0401,E0402,E0701,E0702,E0703,E0704,E0710,E0711,E0712,E1003,E1101,E1102,E1111,E1120,E1121,E1123,E1124,E1125,E1126,E1127,E1128,E1129,E1130,E1131,E1132,E1133,E1134,E1135,E1136,E1137,E1138,E1139,E1200,E1201,E1205,E1206,E1300,E1301,E1302,E1303,E1304,E1305,E1306,E1310,E1700,E1701,W0311,W0312,W0611,W0612,W0613,W0702,W1401,W1402,C0123,C0200,C0325,C0411,C0412 --fail-under=5 pylint $(git ls-files '*.py') --disable=all --enable=E0001,E0100,E0101,E0102,E0103,E0104,E0105,E0107,E0108,E0110,E0111,E0112,E0113,E0114,E0115,E0116,E0117,E0118,E0202,E0203,E0211,E0213,E0236,E0237,E0238,E0239,E0240,E0241,E0301,E0302,E0303,E0401,E0402,E0701,E0702,E0703,E0704,E0710,E0711,E0712,E1003,E1101,E1102,E1111,E1120,E1121,E1123,E1124,E1125,E1126,E1127,E1128,E1129,E1130,E1131,E1132,E1133,E1134,E1135,E1136,E1137,E1138,E1139,E1200,E1201,E1205,E1206,E1300,E1301,E1302,E1303,E1304,E1305,E1306,E1310,E1700,E1701,W0311,W0312,W0611,W0612,W0613,W0702,W1401,W1402,C0123,C0200,C0325,C0411,C0412 --fail-under=5


@ -13,12 +13,12 @@ jobs:
radon: radon:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v3
with: with:
fetch-depth: 0 fetch-depth: 0
- name: Set up Python - name: Set up Python
uses: actions/setup-python@v5 uses: actions/setup-python@v4
with: with:
python-version: '3.x' python-version: '3.x'

.gitignore

@ -13,28 +13,18 @@ __pycache__/
# Allow these files # Allow these files
!.pre-commit-config.yaml !.pre-commit-config.yaml
!.env.example
!setup.py
# Allow src folder and its .py files # Allow src folder and its .py files
!src/ !src/
src/* src/*
!src/*.py !src/*.py
!src/gguf
src/gguf/*
!src/gguf/*.py
# Allow docs folder and its .py files # Allow docs folder and its .py files
!docs/ !docs/
docs/* docs/*
!docs/*.py !docs/*.py
# Allow plugins folder and its .py files # Allow assets folder, but only .svg, .png, .rc and .ico files
!plugins/
plugins/*
!plugins/*.py
# Allow assets folder, but only .svg, .png, .rc, .css, .iss and .ico files
!assets/ !assets/
assets/* assets/*
!assets/*.svg !assets/*.svg
@ -42,8 +32,6 @@ assets/*
!assets/*.ico !assets/*.ico
!assets/*.rc !assets/*.rc
!assets/*.res !assets/*.res
!assets/*.css
!assets/*.iss
# Allow .github folder and its contents # Allow .github folder and its contents
!.github/ !.github/


@ -1,209 +1,5 @@
# Changelog # Changelog
## [v2.0.1] - 2025-05-24
### Added
- Human readable mappings from KV pairs into model properties
- certifi library for backend download and update checking
- Automated checksums in CI process
### Changed
- Updated llama.cpp backend
- Improved backend UI, logging, and task handling
- Enhanced display of model properties and cleaner formatting of KV pairs
- Updated tensor data formatting and removed redundant KV pairs property
- Updated CUDA backend check for latest llama.cpp release format
- Global urllib usage implementation
- Updated README with more information about patches and updates
- Edited quick start instructions
- Small file formatting improvements
### Fixed
- Type hints corrections
- Build errors in CI
- `@upload-artifact` updated to v4
## [v2.0.0] - 2025-01-27
### Added
- Clipboard support for save/load preset functionality with shift-click option
- Support for shift-clicking to get quantization command
- AUTOGGUF_BACKEND_REPO environment variable for custom GitHub repository fetching
- New HF to GGUF conversion types: `tq1_0` and `tq2_0`
### Changed
- Updated multiple dependencies:
- PySide6, PyTorch, Transformers, FastAPI, uvicorn, and other core libraries to their latest compatible versions
- Adjusted monitoring intervals from 0.2s to 0.5s
- Updated copyright year to 2025
- Bundled llama.cpp licensing text in About menu
- Removed x86 build matrix from CI
- Removed Import Model confirmation dialog
### Fixed
- Resolved PySide6 segfault issue
- Fixed error when deleting models from list
- Corrected incorrect menu bar name for Load Preset
## [v1.9.1] - 2024-10-13
### Added
- Support for specifying log directory name using AUTOGGUF_LOG_DIR_NAME environment variable
- Work in progress GGUF merge window
- Support for repository types in HF Transfer utility
- New `dequantize_gguf.py` script
- Support for MiniCPM3, RWKVv6, OLMoE, IBM Granite, and Jamba in llama.cpp convert scripts (conversion only)
- Add Nuitka build script for Linux
### Changed
- Updated Finnish and Russian localizations using Claude 3 Opus
- Improved layout of HF Upload window
- Updated gguf library from upstream
- Refactored code to use localizations for menubar
- Renamed imports_and_globals.py to globals.py
- Moved general functions verify_gguf and process_args to globals.py
- Created Plugins class for extensibility
- Updated dependencies:
- huggingface-hub
- fastapi (~=0.115.0)
- setuptools (~=75.1.0)
- pyside6 (~=6.7.3)
- uvicorn (~=0.31.0)
### Fixed
- Corrected localization strings and file select types for GGUF merging
- Fix minor errors in build scripts
## [v1.9.0] - 2024-09-15
### Added
- Implemented Hugging Face (HF) upload functionality with GUI definitions
- Added RAM and CPU usage graphs to UI
- Input validation using wraps added to UI
- Right-click context menu added to the models list in UI
- Support for iMatrix generation tracking
- GGUF splitting feature added
- Japanese and German localizations updated
### Changed
- Refactored to move functions out of `AutoGGUF` to reduce bloat
- Localized GGUF split strings
- Optimized GGUF imports and renamed related modules
- Removed old `HFTransfer` class
- Adjusted logging strings and updated French and Dutch localizations
- Improved startup time by optimizing default configuration, disabling network fetches for backends/updates
- Removed `requests` and `python-dotenv` to reduce size
- Updated `fastapi` requirement from `~=0.112.2` to `~=0.114.2`
- Updated `torch` requirement from `~=2.4.0` to `~=2.4.1`
- Updated `setuptools` requirement from `~=74.0.0` to `~=74.1.2`
- Updated `safetensors` requirement from `~=0.4.4` to `~=0.4.5`
- Updated `huggingface-hub` requirement from `~=0.24.6` to `~=0.24.7`
### Fixed
- Adjusted indeterminate progress bar behavior
- Removed comments in `requirements.txt` and updated its formatting
## [v1.8.1] - 2024-09-04
### Added
- AutoFP8 quantization classes and window (currently WIP)
- Minimize/maximize buttons to title bar
- API key authentication support for the local server
- HuggingFace upload/download class
- OpenAPI docs for endpoints
- Added new showcase image
### Changed
- Replaced Flask with FastAPI and Uvicorn for improved performance
- Moved functions out of AutoGGUF.py into utils.py and TaskListItem.py
- Updated llama.cpp convert scripts
- Improved LoRA conversion process:
- Allow specifying output path in arguments
- Removed shutil.move operation
- Increased max number of LoRA layers
- Changed default port to 7001
- Now binding to localhost (127.0.0.1) instead of 0.0.0.0
- Updated Spanish localizations
- Updated setuptools requirement from ~=68.2.0 to ~=74.0.0
- Updated .env.example with new configuration parameters
### Fixed
- Web page not found error
- Use of proper status in TaskListItem
- Passing of quant_threads and Logger to TaskListItem
- Improved window moving smoothness
- Prevention of moving window below taskbar
- Optimized imports in various files
- Remove aliased quant types
## [v1.8.0] - 2024-08-26
### Added
- .env.example file added
- Sha256 generation support added to build.yml
- Allow importing models from any directory on the system
- Added manual model import functionality
- Verification for manual imports and support for concatenated files
- Implemented plugins feature using importlib
- Configuration options for AUTOGGUF_MODEL_DIR_NAME, AUTOGGUF_OUTPUT_DIR_NAME, and AUTOGGUF_RESIZE_FACTOR added
### Changed
- Moved get helper functions to utils.py
- Added type hints
- Reformat TaskListItem.py for better readability
- Separate macOS and Linux runs in CI/CD
- Updated .gitignore for better file management
- Updated numpy requirement from <2.0.0 to <3.0.0
### Fixed
- Fixed sha256 file format and avoided overwriting
- Updated regex for progress tracking
- Arabic and French localizations fixed
- Only count valid backends instead of total backend combos
- Import missing modules
## [v1.7.2] - 2024-08-19
### Added
- Update checking support (controlled by AUTOGGUF_CHECK_UPDATE environment variable)
- Live update support for GPU monitor graphs
- Smoother usage bar changes in monitor
- Unicode X button in KV Overrides box
- PyPI setup script
- Inno Setup build file
- Missing requirements and dotenv file loading
### Changed
- Moved functions out of AutoGGUF.py
- Relocated CustomTitleBar to separate file
- Updated torch requirement from ~=2.2.0 to ~=2.4.0
- Updated showcase image
- Version bumped to v1.7.2 in Localizations.py
### Fixed
- setup.py issues
## [v1.7.1] - 2024-08-16
### Added
- Modern UI with seamless title bar
- Window resizing shortcuts (Ctrl+, Ctrl-, Ctrl+0)
- Theming support
- CPU usage bar
- Save Preset and Load Preset options in File menu
- Support for EXAONE model type
- Window size configuration through environment variables
### Changed
- Refactored window to be scrollable
- Moved save/load preset logic to presets.py
- Updated docstrings for AutoGGUF.py, lora_conversion.py, and Logger.py
- Adapted gguf library to project standards
### Fixed
- Updated version to v1.7.0
- Fixed IDE-detected code typos and errors
## [v1.7.0] - 2024-08-16 ## [v1.7.0] - 2024-08-16
### Added ### Added
@ -260,7 +56,7 @@ ### Notes
- Fast build: Higher unzipped size (97MB), smaller download (38MB) - Fast build: Higher unzipped size (97MB), smaller download (38MB)
- Standard build: Created with PyInstaller, medium download and unzipped size (50MB), potentially slower - Standard build: Created with PyInstaller, medium download and unzipped size (50MB), potentially slower
## [v1.6.0] - 2024-08-08 ## [1.6.0] - 2024-08-08
### Changed ### Changed
- Resolve licensing issues by using PySide6 - Resolve licensing issues by using PySide6
@ -268,7 +64,7 @@ ### Changed
### Added ### Added
- Add GPU monitoring support for NVIDIA GPUs - Add GPU monitoring support for NVIDIA GPUs
## [v1.5.1] - 2024-08-08 ## [1.5.1] - 2024-08-08
### Changed ### Changed
- Refactor localizations to use them in HF conversion area - Refactor localizations to use them in HF conversion area
@ -280,7 +76,7 @@ ### Removed
### Added ### Added
- Support loading *.gguf file types - Support loading *.gguf file types
## [v1.5.0] - 2024-08-06 ## [1.5.0] - 2024-08-06
### Changed ### Changed
- Refactor localizations to use them in HF conversion area - Refactor localizations to use them in HF conversion area
@ -293,7 +89,7 @@ ### Added
### Fixed ### Fixed
- Fix scaling on low resolution screens, interface now scrolls - Fix scaling on low resolution screens, interface now scrolls
## [v1.4.3] - 2024-08-05 ## [1.4.3] - 2024-08-05
### Changed ### Changed
- Updated src file in release to be Black formatted - Updated src file in release to be Black formatted
@ -306,7 +102,7 @@ ### Added
- Added model sharding management support - Added model sharding management support
- Allow multiple quantization types to be selected and started simultaneously - Allow multiple quantization types to be selected and started simultaneously
## [v1.4.2] - 2024-08-04 ## [1.4.2] - 2024-08-04
### Fixed ### Fixed
- Resolves bug where Base Model text was shown even when GGML type was selected - Resolves bug where Base Model text was shown even when GGML type was selected
@ -315,13 +111,13 @@ ### Fixed
### Changed ### Changed
- Minor repository changes - Minor repository changes
## [v1.4.1] - 2024-08-04 ## [1.4.1] - 2024-08-04
### Added ### Added
- Dynamic KV Overrides (see wiki: AutoGGUF/wiki/Dynamic-KV-Overrides) - Dynamic KV Overrides (see wiki: AutoGGUF/wiki/Dynamic-KV-Overrides)
- Quantization commands are now printed and logged - Quantization commands are now printed and logged
## [v1.4.0] - 2024-08-04 ## [1.4.0] - 2024-08-04
### Added ### Added
- LoRA Conversion: - LoRA Conversion:
@ -345,7 +141,7 @@ ### Added
- Currently includes src folder with conversion tools - Currently includes src folder with conversion tools
- No console window popup - No console window popup
## [v1.3.1] - 2024-08-04 ## [1.3.1] - 2024-08-04
### Added ### Added
- AUTOGGUF_CHECK_BACKEND environment variable to disable backend check on start - AUTOGGUF_CHECK_BACKEND environment variable to disable backend check on start
@ -353,7 +149,7 @@ ### Added
### Changed ### Changed
- --onefile build with PyInstaller, _internal directory is no longer required - --onefile build with PyInstaller, _internal directory is no longer required
## [v1.3.0] - 2024-08-03 ## [1.3.0] - 2024-08-03
### Added ### Added
- Support for new llama-imatrix parameters: - Support for new llama-imatrix parameters:
@ -375,7 +171,7 @@ ### Fixed
### Removed ### Removed
- Duplicated functions - Duplicated functions
## [v1.2.1] - 2024-08-03 ## [1.2.1] - 2024-08-03
### Added ### Added
- Refresh Models button - Refresh Models button
@ -384,13 +180,13 @@ ### Added
### Fixed ### Fixed
- iostream llama.cpp issue, quantized_models directory created on launch - iostream llama.cpp issue, quantized_models directory created on launch
## [v1.2.0] - 2024-08-03 ## [1.2.0] - 2024-08-03
### Added ### Added
- More robust logging (find logs at latest_<timestamp>.log in logs folder) - More robust logging (find logs at latest_<timestamp>.log in logs folder)
- Localizations with support for 28 languages (machine translated using Gemini Experimental 0801) - Localizations with support for 28 languages (machine translated using Gemini Experimental 0801)
## [v1.1.0] - 2024-08-03 ## [1.1.0] - 2024-08-03
### Added ### Added
- Dynamic KV override functionality - Dynamic KV override functionality
@ -413,7 +209,7 @@ ### Added
### Fixed ### Fixed
- Issue where quantization errored with "AutoGGUF does not have x attribute" - Issue where quantization errored with "AutoGGUF does not have x attribute"
## [v1.0.0] - 2024-08-02 ## [1.0.0] - 2024-08-02
### Added ### Added
- Initial release - Initial release


@ -2,6 +2,8 @@ # Contributing to AutoGGUF
First off, thanks for taking the time to contribute! 🎉👍 First off, thanks for taking the time to contribute! 🎉👍
## How Can I Contribute?
### Reporting Bugs ### Reporting Bugs
- Use the issue tracker to report bugs - Use the issue tracker to report bugs
@ -13,18 +15,17 @@ ### Suggesting Enhancements
- Use the issue tracker to suggest enhancements - Use the issue tracker to suggest enhancements
- Explain why this enhancement would be useful - Explain why this enhancement would be useful
### Code Contributions ### Your First Code Contribution
You can find issues labeled with "good first issue" in the Issues tab as a starting point. Code refactors and optimizations are also appreciated, although if there's a vulnerability, please report it privately in the Security tab. For feature PRs, please open a discussion first to make sure your feature can be added and continuously maintained. You can find issues labeled with "good first issue" in the Issues tab as a starting point. Code refactors and optimizations are also appreciated, although if there's a vulnerability, please report it privately in the Security tab. For feature PRs, please open a discussion first to make sure your feature can be added and continuously maintained.
1. Fork the repo 1. Fork the repo
2. Clone your fork (`git clone https://github.com/your-username/AutoGGUF.git && cd AutoGGUF`) 2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
3. Create your feature branch (`git checkout -b feature/AmazingFeature`) 3. Install pre-commit: (`pip install pre-commit`)
5. Install pre-commit: (`pip install pre-commit`) 4. Set up the git hook scripts: (`pre-commit install`)
6. Set up the git hook scripts: (`pre-commit install`) 5. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
7. Commit your changes (`git commit -m 'Add some AmazingFeature'`) 6. Push to the branch (`git push origin feature/AmazingFeature`)
8. Push to the branch (`git push origin feature/AmazingFeature`) 7. Open a Pull Request
9. Open a Pull Request on GitHub
## Styleguides ## Styleguides
@ -32,7 +33,7 @@ ### Git Commit Messages
- Use the present tense ("Add feature" not "Added feature") - Use the present tense ("Add feature" not "Added feature")
- Use the imperative mood ("Move cursor to..." not "Moves cursor to...") - Use the imperative mood ("Move cursor to..." not "Moves cursor to...")
- Limit the first line to 72 characters or fewer - Limit the first line to 72 characters or less
### Commit Types: ### Commit Types:


@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier same "printed page" as the copyright notice for easier
identification within third-party archives. identification within third-party archives.
Copyright (c) 2024-2025 leafspark Copyright 2024 leafspark
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.

README.md

@ -9,85 +9,57 @@ # AutoGGUF - automated GGUF model quantizer
<!-- Project Info --> <!-- Project Info -->
[![Powered by llama.cpp](https://img.shields.io/badge/Powered%20by-llama.cpp-green.svg)](https://github.com/ggerganov/llama.cpp) [![Powered by llama.cpp](https://img.shields.io/badge/Powered%20by-llama.cpp-green.svg)](https://github.com/ggerganov/llama.cpp)
![GitHub top language](https://img.shields.io/github/languages/top/leafspark/AutoGGUF.svg)
[![Platform Compatibility](https://img.shields.io/badge/platform-Linux%20%7C%20macOS%20%7C%20Windows-blue)]() [![Platform Compatibility](https://img.shields.io/badge/platform-Linux%20%7C%20macOS%20%7C%20Windows-blue)]()
[![GitHub license](https://img.shields.io/github/license/leafspark/AutoGGUF.svg)](https://github.com/leafspark/AutoGGUF/blob/main/LICENSE) [![GitHub license](https://img.shields.io/github/license/leafspark/AutoGGUF.svg)](https://github.com/leafspark/AutoGGUF/blob/main/LICENSE)
![GitHub top language](https://img.shields.io/github/languages/top/leafspark/AutoGGUF.svg)
<!-- Repository Stats --> <!-- Repository Stats -->
![GitHub stars](https://img.shields.io/github/stars/leafspark/AutoGGUF.svg) ![GitHub stars](https://img.shields.io/github/stars/leafspark/AutoGGUF.svg)
![GitHub forks](https://img.shields.io/github/forks/leafspark/AutoGGUF.svg) ![GitHub forks](https://img.shields.io/github/forks/leafspark/AutoGGUF.svg)
![GitHub release (latest by date)](https://img.shields.io/github/downloads/leafspark/AutoGGUF/latest/total?color=green) ![GitHub release (latest by date)](https://img.shields.io/github/downloads/leafspark/AutoGGUF/latest/total?color=green)
![GitHub repo size](https://img.shields.io/github/repo-size/leafspark/AutoGGUF.svg) ![GitHub repo size](https://img.shields.io/github/repo-size/leafspark/AutoGGUF.svg)
<!-- ![Lines of Code](https://ghloc.vercel.app/leafspark/AutoGGUF?filter=.bat$,.py$,.sh$,.bat$) -->
<!-- Contribution --> <!-- Contribution -->
[![Issues](https://img.shields.io/github/issues/leafspark/AutoGGUF)](https://github.com/leafspark/AutoGGUF/issues)
[![Code Style: Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![Code Style: Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![Issues](https://img.shields.io/github/issues/leafspark/AutoGGUF)](https://github.com/leafspark/AutoGGUF/issues)
[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/leafspark/AutoGGUF/pulls) [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/leafspark/AutoGGUF/pulls)
The most comprehensive GUI tool for GGUF model quantization. Stop wrestling with command lines - quantize, merge, and optimize your models with just a few clicks. AutoGGUF provides a graphical user interface for quantizing GGUF models using the llama.cpp library. It allows users to download different versions of llama.cpp, manage multiple backends, and perform quantization tasks with various options.
## Features ## Features
- 📩 Update and manage llama.cpp backends - Download and manage llama.cpp backends
- 🗃️ Download and quantize GGUF/safetensors models - Select and quantize GGUF models
- 📐 Configure quantization parameters - Configure quantization parameters
- 💻 Monitor system resources in real time during quantization - Monitor system resources during quantization
- ⏳ Parallel quantization + imatrix generation - Parallel quantization + imatrix generation
- 🎉 LoRA conversion and merging - LoRA conversion and merging
- 📁 Preset saving and loading
- 8⃣ AutoFP8 quantization
- 🪓 GGUF splitting and merging
- 🌐 HTTP API for automation and monitoring
## Why AutoGGUF? ## Usage
- Fast: Saves time on manual configuration
- Simple: Clean UI, no terminal needed
- Powerful: Handles models of any size, limited only by your RAM
- Resource-aware: Optimized memory management and efficient UI library
![AutoGGUF-v1 8 1-showcase-blue](https://github.com/user-attachments/assets/b136ccc3-5983-4266-9e66-00cebf3ca590) ### Cross-platform
1. Install dependencies:
## Quick Start
### Cross-platform (recommended)
1. `git clone https://github.com/leafspark/AutoGGUF`
2. `cd AutoGGUF`
3. Install dependencies:
``` ```
pip install -r requirements.txt pip install -r requirements.txt
``` ```
4. Run the application: 2. Run the application:
``` ```
python src/main.py python src/main.py
``` ```
or use the `run.bat` script. or use the `run.bat` script.
macOS and Ubuntu builds are provided via GitHub Actions; you may download the binaries from the releases section. ### Windows
### Windows (for the impatient)
Standard builds:
1. Download the latest release 1. Download the latest release
2. Extract all files to a folder 2. Extract all files to a folder
3. Run `AutoGGUF-x64.exe` 3. Run `AutoGGUF.exe`
4. Any necessary folders will be automatically created
Setup builds:
1. Download the setup variant of latest release
2. Extract all files to a folder
3. Run the setup program
4. The .gguf extension will be registered with the program automatically
5. Run the program from the Start Menu or desktop shortcuts
After launching the program, you may access its local server at port 7001 (set `AUTOGGUF_SERVER` to "enabled" first).
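
One way to check that the server is up is sketched below; the `/docs` path (FastAPI's standard OpenAPI page) and the authorization header format are assumptions for illustration, not documented endpoints.

```python
# Hypothetical smoke test of the local AutoGGUF server described above.
# The /docs path and Bearer header scheme are assumptions, not documented API.
import os
import urllib.request

port = int(os.getenv("AUTOGGUF_SERVER_PORT", "7001"))
request = urllib.request.Request(f"http://127.0.0.1:{port}/docs")

api_key = os.getenv("AUTOGGUF_SERVER_API_KEY", "")
if api_key:
    request.add_header("Authorization", f"Bearer {api_key}")

with urllib.request.urlopen(request, timeout=5) as response:
    print(response.status, response.headers.get("Content-Type"))
```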
### Verifying Releases ### Verifying Releases
#### Linux/macOS: #### Linux/macOS:
```bash ```bash
gpg --import AutoGGUF-v1.5.0-prerel.asc gpg --import AutoGGUF-v1.5.0-prerel.asc
gpg --verify AutoGGUF-v1.9.1-Windows-avx2.zip.sig AutoGGUF-v1.9.1-Windows-avx2.zip gpg --verify AutoGGUF-v1.5.0-Windows-avx2-prerel.zip.sig AutoGGUF-v1.5.0-Windows-avx2-prerel.zip
sha256sum -c AutoGGUF-v1.9.1.sha256 sha256sum -c AutoGGUF-v1.5.0-prerel.sha256
``` ```
#### Windows (PowerShell): #### Windows (PowerShell):
@ -96,11 +68,11 @@ # Import the public key
gpg --import AutoGGUF-v1.5.0-prerel.asc gpg --import AutoGGUF-v1.5.0-prerel.asc
# Verify the signature # Verify the signature
gpg --verify AutoGGUF-v1.9.1-Windows-avx2.zip.sig AutoGGUF-v1.9.1-Windows-avx2.zip gpg --verify AutoGGUF-v1.5.0-Windows-avx2-prerel.zip.sig AutoGGUF-v1.5.0-Windows-avx2-prerel.zip
# Check SHA256 # Check SHA256
$fileHash = (Get-FileHash -Algorithm SHA256 AutoGGUF-v1.9.1-Windows-avx2.zip).Hash.ToLower() $fileHash = (Get-FileHash -Algorithm SHA256 AutoGGUF-v1.5.0-Windows-avx2-prerel.zip).Hash.ToLower()
$storedHash = (Get-Content AutoGGUF-v1.9.1.sha256 | Select-String AutoGGUF-v1.9.1-Windows-avx2.zip).Line.Split()[0] $storedHash = (Get-Content AutoGGUF-v1.5.0-prerel.sha256 | Select-String AutoGGUF-v1.5.0-Windows-avx2-prerel.zip).Line.Split()[0]
if ($fileHash -eq $storedHash) { "SHA256 Match" } else { "SHA256 Mismatch" } if ($fileHash -eq $storedHash) { "SHA256 Match" } else { "SHA256 Mismatch" }
``` ```
@ -118,53 +90,48 @@ ### Cross-platform
### Windows ### Windows
```bash ```bash
pip install -U pyinstaller
build RELEASE | DEV build RELEASE | DEV
``` ```
Find the executable in `build/<type>/dist/AutoGGUF-x64.exe`. Find the executable in `build/<type>/dist/AutoGGUF.exe`.
You can also use Nuitka, which may result in a slower build but a faster output executable: You can also use the slower build but faster executable script (Nuitka):
```bash ```bash
build_optimized RELEASE | DEV build_optimized RELEASE | DEV
``` ```
## Dependencies
Find them in `requirements.txt`.
## Localizations ## Localizations
View the list of supported languages at [AutoGGUF/wiki/Installation#configuration](https://github.com/leafspark/AutoGGUF/wiki/Installation#configuration) (LLM translated, except for English). View the list of supported languages at [AutoGGUF/wiki/Installation#configuration](https://github.com/leafspark/AutoGGUF/wiki/Installation#configuration) (LLM translated, except for English).
Languages will be updated as soon as possible after an update, or as a part of the update. To use a specific language, set the `AUTOGGUF_LANGUAGE` environment variable to one of the listed language codes (note: some languages may not be fully supported yet, those will fallback to English).
To use a specific language, set the `AUTOGGUF_LANGUAGE` environment variable to one of the listed language codes (note: some languages may not be fully supported yet, in which case the UI elements will fall back to English). ## Known Issues
## Issues - ~~Cannot delete task while processing (planned fix: disallow deletion before cancelling or cancel automatically)~~ (fixed in v1.6.2)
- Some inconsistent logging and signal handling
- Missing or duplicated translations (priority)
- Buggy/incomplete API interfaces
- Code review and formatting (priority)
## Planned Features ## Planned Features
- [ ] Time estimation for quantization - Time estimation for quantization
- [ ] Quantization file size estimate - Actual progress bar tracking
- [ ] Perplexity testing - Perplexity testing
- [ ] bitsandbytes support - Web API and management (partially implemented in v1.6.2)
#### Project Status ## Troubleshooting
AutoGGUF has now entered maintenance mode. It's considered stable and feature-complete for most use cases, so I'm not actively developing new features, but I'll continue to publish occasional builds, update dependencies regularly, and fix critical bugs as needed. If you encounter issues or have suggestions, feel free to open an issue.
## Support
- SSL module cannot be found error: Install OpenSSL or run from source using `python src/main.py` with the `run.bat` script (`pip install requests`) - SSL module cannot be found error: Install OpenSSL or run from source using `python src/main.py` with the `run.bat` script (`pip install requests`)
- Check out the [Wiki](https://github.com/leafspark/AutoGGUF/wiki) for advanced usage and configuration
## Contributing ## Contributing
Fork the repo, make your changes, and ensure you have the latest commits when merging. Include a changelog of new features in your pull request description. Read `CONTRIBUTING.md` for more information. Fork the repo, make your changes, and ensure you have the latest commits when merging. Include a changelog of new features in your pull request description. Read `CONTRIBUTING.md` for more information.
## User Interface
![rsz_1autogguf-v162-screenshot-blue](https://github.com/user-attachments/assets/0e69dd3d-95b0-4bc6-b29e-df308bf027c4)
## Stargazers ## Stargazers
[![Star History Chart](https://api.star-history.com/svg?repos=leafspark/AutoGGUF&type=Date)](https://star-history.com/#leafspark/AutoGGUF&Date) [![Star History Chart](https://api.star-history.com/svg?repos=leafspark/AutoGGUF&type=Date)](https://star-history.com/#leafspark/AutoGGUF&Date)
`Last Updated: May 24, 2025`


@ -3,11 +3,9 @@ # Security Policy
## Supported Versions ## Supported Versions
| Version | Supported | | Version | Supported |
|-----------------|--------------------| | ----------------- | ------------------ |
| stable (v2.0.x) | :white_check_mark: | | stable (v1.6.2) | :white_check_mark: |
Beta versions are not officially supported and may contain unknown security vulnerabilities. Use them at your own risk.
## Reporting a Vulnerability ## Reporting a Vulnerability
Use the Issues tab, or for severe vulnerabilities, please contact the maintainers via email. Use the Issues tab.


@ -1,81 +0,0 @@
#define MyAppName "AutoGGUF"
#define MyAppVersion "v1.7.1"
#define MyAppPublisher "leafspark"
#define MyAppURL "https://github.com/leafspark/AutoGGUF"
#define MyAppExeName "AutoGGUF-x64.exe"
#define MyAppAssocName MyAppName + " File"
#define MyAppAssocExt ".gguf"
#define MyAppAssocKey StringChange(MyAppAssocName, " ", "") + MyAppAssocExt
[Setup]
; NOTE: The value of AppId uniquely identifies this application. Do not use the same AppId value in installers for other applications.
; (To generate a new GUID, click Tools | Generate GUID inside the IDE.)
AppId={{9753D5EB-05A8-489B-86A4-FCE6341FDE0E}
AppName={#MyAppName}
AppVersion={#MyAppVersion}
;AppVerName={#MyAppName} {#MyAppVersion}
AppPublisher={#MyAppPublisher}
AppPublisherURL={#MyAppURL}
AppSupportURL={#MyAppURL}
AppUpdatesURL={#MyAppURL}
DefaultDirName={autopf}\{#MyAppName}
; "ArchitecturesAllowed=x64compatible" specifies that Setup cannot run
; on anything but x64 and Windows 11 on Arm.
ArchitecturesAllowed=x64compatible
; "ArchitecturesInstallIn64BitMode=x64compatible" requests that the
; install be done in "64-bit mode" on x64 or Windows 11 on Arm,
; meaning it should use the native 64-bit Program Files directory and
; the 64-bit view of the registry.
ArchitecturesInstallIn64BitMode=x64compatible
ChangesAssociations=yes
DisableProgramGroupPage=yes
LicenseFile=F:\autogguf-release\LICENSE.txt
; Remove the following line to run in administrative install mode (install for all users.)
PrivilegesRequired=lowest
PrivilegesRequiredOverridesAllowed=dialog
OutputDir=E:\Downloads\autogguf-inno
OutputBaseFilename=autogguf
Compression=lzma
SolidCompression=yes
WizardStyle=modern
[Languages]
Name: "english"; MessagesFile: "compiler:Default.isl"
Name: "brazilianportuguese"; MessagesFile: "compiler:Languages\BrazilianPortuguese.isl"
Name: "dutch"; MessagesFile: "compiler:Languages\Dutch.isl"
Name: "finnish"; MessagesFile: "compiler:Languages\Finnish.isl"
Name: "french"; MessagesFile: "compiler:Languages\French.isl"
Name: "german"; MessagesFile: "compiler:Languages\German.isl"
Name: "hungarian"; MessagesFile: "compiler:Languages\Hungarian.isl"
Name: "italian"; MessagesFile: "compiler:Languages\Italian.isl"
Name: "japanese"; MessagesFile: "compiler:Languages\Japanese.isl"
Name: "korean"; MessagesFile: "compiler:Languages\Korean.isl"
Name: "polish"; MessagesFile: "compiler:Languages\Polish.isl"
Name: "portuguese"; MessagesFile: "compiler:Languages\Portuguese.isl"
Name: "russian"; MessagesFile: "compiler:Languages\Russian.isl"
Name: "spanish"; MessagesFile: "compiler:Languages\Spanish.isl"
Name: "turkish"; MessagesFile: "compiler:Languages\Turkish.isl"
Name: "ukrainian"; MessagesFile: "compiler:Languages\Ukrainian.isl"
[Tasks]
Name: "desktopicon"; Description: "{cm:CreateDesktopIcon}"; GroupDescription: "{cm:AdditionalIcons}"; Flags: unchecked
[Files]
Source: "F:\autogguf-release\AutoGGUF-v1.7.1-Windows-avx2-standard\{#MyAppExeName}"; DestDir: "{app}"; Flags: ignoreversion
Source: "F:\autogguf-release\AutoGGUF-v1.7.1-Windows-avx2-standard\src\*"; DestDir: "{app}"; Flags: ignoreversion recursesubdirs createallsubdirs
; NOTE: Don't use "Flags: ignoreversion" on any shared system files
[Registry]
Root: HKA; Subkey: "Software\Classes\{#MyAppAssocExt}\OpenWithProgids"; ValueType: string; ValueName: "{#MyAppAssocKey}"; ValueData: ""; Flags: uninsdeletevalue
Root: HKA; Subkey: "Software\Classes\{#MyAppAssocKey}"; ValueType: string; ValueName: ""; ValueData: "{#MyAppAssocName}"; Flags: uninsdeletekey
Root: HKA; Subkey: "Software\Classes\{#MyAppAssocKey}\DefaultIcon"; ValueType: string; ValueName: ""; ValueData: "{app}\{#MyAppExeName},0"
Root: HKA; Subkey: "Software\Classes\{#MyAppAssocKey}\shell\open\command"; ValueType: string; ValueName: ""; ValueData: """{app}\{#MyAppExeName}"" ""%1"""
Root: HKA; Subkey: "Software\Classes\Applications\{#MyAppExeName}\SupportedTypes"; ValueType: string; ValueName: ".myp"; ValueData: ""
[Icons]
Name: "{autoprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"
Name: "{autodesktop}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; Tasks: desktopicon
[Run]
Filename: "{app}\{#MyAppExeName}"; Description: "{cm:LaunchProgram,{#StringChange(MyAppName, '&', '&&')}}"; Flags: nowait postinstall skipifsilent


@ -1 +0,0 @@
/* Leave this file blank for default theme */

assets/icon.RES (binary file)

assets/icon.rc

@ -0,0 +1 @@
IDI_ICON1 ICON "favicon.ico"


@ -1,20 +1,33 @@
#!/bin/bash #!/bin/bash
if [ $# -eq 0 ]; then if [ $# -eq 0 ]; then
echo "Usage: build.sh [RELEASE|DEV]" echo "Usage: $0 [RELEASE|DEV]"
exit 1 exit 1
fi fi
if [ "${1,,}" = "release" ]; then BUILD_TYPE=$1
echo "Building RELEASE version..." ICON_PATH="../../assets/favicon_large.png"
pyinstaller --windowed --onefile --name=AutoGGUF --icon=../../assets/favicon_large.png --add-data "../../assets:assets" --distpath=build/release/dist --workpath=build/release/build --specpath=build/release src/main.py ASSETS_PATH="../../assets"
elif [ "${1,,}" = "dev" ]; then SRC_PATH="src/main.py"
echo "Building DEV version..."
pyinstaller --onefile --name=AutoGGUF --icon=../../assets/favicon_large.png --add-data "../../assets:assets" --distpath=build/dev/dist --workpath=build/dev/build --specpath=build/dev src/main.py case $BUILD_TYPE in
else RELEASE)
echo "Invalid argument. Use RELEASE or DEV." OUTPUT_DIR="build/release"
EXTRA_ARGS="--windowed"
;;
DEV)
OUTPUT_DIR="build/dev"
EXTRA_ARGS=""
;;
*)
echo "Invalid build type. Use RELEASE or DEV."
exit 1 exit 1
fi ;;
esac
echo "Building $BUILD_TYPE version..."
pyinstaller $EXTRA_ARGS --onefile --name=AutoGGUF --icon=$ICON_PATH --add-data "$ASSETS_PATH:assets" --distpath=$OUTPUT_DIR/dist --workpath=$OUTPUT_DIR/build --specpath=$OUTPUT_DIR $SRC_PATH
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Build failed." echo "Build failed."


@ -1,7 +1,7 @@
@echo off @echo off
if "%1"=="" ( if "%1"=="" (
echo Usage: build_optimized.bat [RELEASE^|DEV] echo Usage: build_fast.bat [RELEASE^|DEV]
exit /b 1 exit /b 1
) )


@ -1,26 +0,0 @@
#!/bin/bash
if [ -z "$1" ]; then
echo "Usage: build_fast.sh [RELEASE|DEV]"
exit 1
fi
COMMON_FLAGS="--standalone --enable-plugin=pyside6 --include-data-dir=assets=assets"
if [ "$1" == "RELEASE" ]; then
echo "Building RELEASE version..."
python -m nuitka $COMMON_FLAGS --windows-console-mode=disable --output-dir=build/release src/main.py --lto=yes
elif [ "$1" == "DEV" ]; then
echo "Building DEV version..."
python -m nuitka $COMMON_FLAGS --output-dir=build/dev src/main.py
else
echo "Invalid argument. Use RELEASE or DEV."
exit 1
fi
if [ $? -ne 0 ]; then
echo "Build failed."
exit 1
else
echo "Build completed successfully."
fi


@ -1,319 +1,145 @@
import importlib
import json
import re
import shutil
from datetime import datetime
from functools import partial
from typing import Any, Dict, List, Tuple
import requests
from PySide6.QtCore import *
from PySide6.QtGui import *
from PySide6.QtWidgets import *
from dotenv import load_dotenv
import lora_conversion
import presets
import ui_update
import utils
from CustomTitleBar import CustomTitleBar
from GPUMonitor import GPUMonitor
from Localizations import *
from Logger import Logger
from QuantizationThread import QuantizationThread
from TaskListItem import TaskListItem
from error_handling import handle_error, show_error
from imports_and_globals import (
ensure_directory,
open_file_safe,
resource_path,
show_about,
)
class CustomTitleBar(QWidget):
"""
Custom title bar for the main window, providing drag-and-drop functionality
and minimize/close buttons.
"""
def __init__(self, parent=None):
"""
Initializes the custom title bar.
Args:
parent (QWidget, optional): The parent widget. Defaults to None.
"""
class AutoGGUF(QMainWindow): class AutoGGUF(QMainWindow):
""" """
Main application window for AutoGGUF, providing a user interface for AutoGGUF is a PySide6-based graphical user interface for managing and quantizing large language models.
quantizing and converting large language models.
This class provides functionality for:
- Loading and displaying models (including sharded models)
- Quantizing models with various options
- Downloading llama.cpp releases
- Generating importance matrices
- Converting and exporting LoRA models
- Managing quantization tasks
- Converting Hugging Face models to GGUF format
The GUI allows users to interact with these features in an intuitive way, providing
options for model selection, quantization parameters, and task management.
Attributes:
logger (Logger): Instance of the Logger class for logging operations.
ram_bar (QProgressBar): Progress bar for displaying RAM usage.
cpu_label (QLabel): Label for displaying CPU usage.
gpu_monitor (GPUMonitor): Widget for monitoring GPU usage.
backend_combo (QComboBox): Dropdown for selecting the backend.
model_tree (QTreeWidget): Tree widget for displaying available models.
task_list (QListWidget): List widget for displaying ongoing tasks.
quant_threads (list): List to store active quantization threads.
The class also contains numerous UI elements for user input and interaction,
including text inputs, checkboxes, and buttons for various operations.
""" """
def __init__(self): def __init__(self):
""" """
Initializes the main window, setting up the UI, logger, and other Initialize the AutoGGUF application window.
necessary components.
"""
def keyPressEvent(self, event): This method sets up the main window, initializes the UI components,
""" sets up layouts, and connects various signals to their respective slots.
Handles key press events for window resizing. It also initializes the logger, sets up the system info update timer,
and prepares the application for model management and quantization tasks.
Args: The initialization process includes:
event (QKeyEvent): The key press event. - Setting up the main window properties (title, icon, size)
""" - Creating and arranging UI components for different functionalities
- Initializing backend and release information
def resize_window(self, larger): - Setting up file browsers for various inputs
""" - Preparing quantization options and task management interface
Resizes the window by a specified factor. - Initializing iMatrix generation interface
- Setting up LoRA conversion and export interfaces
Args: - Preparing Hugging Face to GGUF conversion interface
larger (bool): Whether to make the window larger or smaller.
"""
def reset_size(self):
"""Resets the window to its default size."""
def parse_resolution(self):
"""
Parses the resolution from the AUTOGGUF_RESOLUTION environment variable.
Returns:
tuple: The width and height of the window.
"""
def resizeEvent(self, event):
"""
Handles resize events to maintain rounded corners.
Args:
event (QResizeEvent): The resize event.
""" """
def refresh_backends(self): def refresh_backends(self):
"""Refreshes the list of available backends."""
def save_task_preset(self, task_item):
""" """
Saves the preset for a specific task. Refresh the list of available backends.
Args: This method scans the 'llama_bin' directory for valid backends,
task_item (TaskListItem): The task item to save the preset for. updates the backend selection combo box, and enables/disables
it based on the availability of backends.
The method logs the refresh operation and the number of valid
backends found.
""" """
def browse_export_lora_model(self): def update_assets(self):
"""Opens a file dialog to browse for the export LORA model file."""
def browse_export_lora_output(self):
"""Opens a file dialog to browse for the export LORA output file."""
def add_lora_adapter(self):
"""Adds a LORA adapter to the export LORA list."""
def browse_base_model(self):
"""Opens a file dialog to browse for the base model folder."""
def delete_lora_adapter_item(self, adapter_widget):
""" """
Deletes a LORA adapter item from the export LORA list. Update the list of assets for the selected llama.cpp release.
Args: This method clears the current asset list and populates it with
adapter_widget (QWidget): The widget containing the adapter information. the assets of the selected release. It also updates the CUDA
option visibility based on the selected asset.
""" """
def browse_hf_model_input(self): def download_llama_cpp(self):
"""Opens a file dialog to browse for the HuggingFace model directory."""
def browse_hf_outfile(self):
"""Opens a file dialog to browse for the HuggingFace to GGUF output file."""
def convert_hf_to_gguf(self):
"""Converts a HuggingFace model to GGUF format."""
def export_lora(self):
"""Exports a LORA from a GGML model."""
def restart_task(self, task_item):
""" """
Restarts a specific task. Initiate the download of the selected llama.cpp release asset.
Args: This method starts a download thread for the selected asset,
task_item (TaskListItem): The task item to restart. updates the UI to show download progress, and sets up signal
""" connections for download completion and error handling.
def lora_conversion_finished(self, thread, input_path, output_path):
"""
Handles the completion of a LORA conversion task.
Args:
thread (QuantizationThread): The thread that handled the conversion.
input_path (str): The path to the input LORA file.
output_path (str): The path to the output GGML file.
"""
def download_finished(self, extract_dir):
"""
Handles the completion of a download, extracting files and updating the UI.
Args:
extract_dir (str): The directory where the downloaded files were extracted.
"""
def extract_cuda_files(self, extract_dir, destination):
"""
Extracts CUDA files from a downloaded archive.
Args:
extract_dir (str): The directory where the downloaded files were extracted.
destination (str): The destination directory for the CUDA files.
"""
def download_error(self, error_message):
"""
Handles download errors, displaying an error message and cleaning up.
Args:
error_message (str): The error message.
"""
def show_task_context_menu(self, position):
"""
Shows the context menu for a task item in the task list.
Args:
position (QPoint): The position of the context menu.
"""
def show_task_properties(self, item):
"""
Shows the properties dialog for a specific task.
Args:
item (QListWidgetItem): The task item.
"""
def toggle_gpu_offload_auto(self, state):
"""
Toggles the automatic GPU offload option.
Args:
state (Qt.CheckState): The state of the checkbox.
"""
def cancel_task_by_item(self, item):
"""
Cancels a task by its item in the task list.
Args:
item (QListWidgetItem): The task item.
"""
def cancel_task(self, item):
"""
Cancels a specific task.
Args:
item (QListWidgetItem): The task item.
"""
def delete_task(self, item):
"""
Deletes a specific task.
Args:
item (QListWidgetItem): The task item.
"""
def create_label(self, text, tooltip):
"""
Creates a QLabel with a tooltip.
Args:
text (str): The text for the label.
tooltip (str): The tooltip for the label.
Returns:
QLabel: The created label.
""" """
def load_models(self): def load_models(self):
"""Loads the available models and displays them in the model tree.""" """
Load and display the list of available models.
def browse_models(self): This method scans the specified models directory for .gguf files,
"""Opens a file dialog to browse for the models directory.""" organizes them into sharded and single models, and populates
the model tree widget with this information.
def browse_output(self): """
"""Opens a file dialog to browse for the output directory."""
def browse_logs(self):
"""Opens a file dialog to browse for the logs directory."""
def browse_imatrix(self):
"""Opens a file dialog to browse for the imatrix file."""
def validate_quantization_inputs(self):
"""Validates the inputs for quantization."""
def add_kv_override(self, override_string=None):
"""Adds a KV override entry to the list."""
def remove_kv_override(self, entry):
"""Removes a KV override entry from the list."""
def quantize_model(self): def quantize_model(self):
"""Quantizes the selected model."""
def parse_progress(self, line, task_item):
""" """
Parses the progress from the output line and updates the task item. Start the quantization process for the selected model.
Args: This method prepares the quantization command based on user-selected
line (str): The output line. options, creates a new quantization thread, and sets up a task item
task_item (TaskListItem): The task item. in the task list to track the quantization progress.
""" """
def task_finished(self, thread, task_item):
"""
Handles the completion of a task.
Args:
thread (QuantizationThread): The thread that handled the task.
task_item (TaskListItem): The task item.
"""
def show_task_details(self, item):
"""
Shows the details of a specific task.
Args:
item (QListWidgetItem): The task item.
"""
def browse_imatrix_datafile(self):
"""Opens a file dialog to browse for the imatrix data file."""
def browse_imatrix_model(self):
"""Opens a file dialog to browse for the imatrix model file."""
def browse_imatrix_output(self):
"""Opens a file dialog to browse for the imatrix output file."""
def get_models_data(self):
"""Retrieves data for all loaded models."""
def get_tasks_data(self):
"""Retrieves data for all tasks in the task list."""
def generate_imatrix(self): def generate_imatrix(self):
"""Generates an imatrix file.""" """
Start the importance matrix generation process.
This method prepares the iMatrix generation command based on user inputs,
creates a new thread for the operation, and sets up a task item
in the task list to track the generation progress.
"""
def convert_lora(self):
"""
Start the LoRA conversion process.
This method prepares the LoRA conversion command based on user inputs,
creates a new thread for the conversion, and sets up a task item
in the task list to track the conversion progress.
"""
def export_lora(self):
"""
Start the LoRA export process.
This method prepares the LoRA export command based on user inputs,
creates a new thread for the export operation, and sets up a task item
in the task list to track the export progress.
"""
def convert_hf_to_gguf(self):
"""
Start the process of converting a Hugging Face model to GGUF format.
This method prepares the conversion command based on user inputs,
creates a new thread for the conversion, and sets up a task item
in the task list to track the conversion progress.
"""
def closeEvent(self, event: QCloseEvent): def closeEvent(self, event: QCloseEvent):
""" """
Handles close events, prompting the user if there are running tasks. Handle the window close event.
This method is called when the user attempts to close the application.
It checks for any running tasks and prompts the user for confirmation
before closing if tasks are still in progress.
Args: Args:
event (QCloseEvent): The close event. event (QCloseEvent): The close event object.
""" """


@ -1,44 +0,0 @@
import os
import zipfile
import requests
from PySide6.QtCore import QThread, Signal
class DownloadThread(QThread):
"""
A QThread subclass for downloading and extracting zip files.
This thread downloads a file from a given URL, saves it to a specified path,
extracts its contents if it's a zip file, and then removes the original zip file.
Signals:
progress_signal (int): Emits the download progress as a percentage.
finished_signal (str): Emits the path of the extracted directory upon successful completion.
error_signal (str): Emits an error message if an exception occurs during the process.
"""
def __init__(self, url: str, save_path: str) -> None:
"""
Initialize the DownloadThread.
Args:
url (str): The URL of the file to download.
save_path (str): The local path where the file will be saved.
"""
def run(self) -> None:
"""
Execute the download, extraction, and cleanup process.
This method performs the following steps:
1. Downloads the file from the specified URL.
2. Saves the file to the specified path.
3. Extracts the contents if it's a zip file.
4. Removes the original zip file after extraction.
5. Emits signals for progress updates, completion, or errors.
Raises:
Exception: Any exception that occurs during the process is caught
and emitted through the error_signal.
"""

View File

@ -1,56 +0,0 @@
class Logger:
"""
This class provides a custom logger for logging messages to both the console and a rotating log file.
The log file will be created in the specified `log_dir` with a timestamp in the filename.
The file will rotate when it reaches 10MB, keeping a maximum of 5 backup files.
"""
def __init__(self, name, log_dir):
"""
Initializes the logger with a specified name and log directory.
Args:
name (str): The name of the logger.
log_dir (str): The directory where log files will be stored.
"""
def debug(self, message):
"""
Logs a message with the DEBUG level.
Args:
message (str): The message to log.
"""
def info(self, message):
"""
Logs a message with the INFO level.
Args:
message (str): The message to log.
"""
def warning(self, message):
"""
Logs a message with the WARNING level.
Args:
message (str): The message to log.
"""
def error(self, message):
"""
Logs a message with the ERROR level.
Args:
message (str): The message to log.
"""
def critical(self, message):
"""
Logs a message with the CRITICAL level.
Args:
message (str): The message to log.
"""

View File

@ -1,28 +0,0 @@
class ModelInfoDialog(QDialog):
"""
A dialog window for displaying model information.
This class creates a dialog that shows detailed information about a machine learning model,
including its architecture, quantization type, and other relevant data.
Attributes:
None
Args:
model_info (dict): A dictionary containing the model's information.
parent (QWidget, optional): The parent widget of this dialog. Defaults to None.
"""
def format_model_info(self, model_info) -> str:
"""
Formats the model information into HTML for display.
This method takes the raw model information and converts it into a formatted HTML string,
which can be displayed in the dialog's QTextEdit widget.
Args:
model_info (dict): A dictionary containing the model's information.
Returns:
str: Formatted HTML string containing the model information.
"""

View File

@ -1,17 +0,0 @@
def convert_lora(self):
"""Converts a LORA file to either GGML or GGUF format.
This function initiates the conversion process based on user input,
utilizing a separate thread for the actual conversion and providing
progress updates in the UI.
It validates input paths, constructs the conversion command, creates
a log file, manages the conversion thread, and handles errors.
Args:
self: The object instance.
Raises:
ValueError: If required input paths are missing.
"""

View File

@ -1,13 +0,0 @@
class ExamplePlugin:
def init(self, autogguf_instance):
# This gets called after the plugin is loaded
print("Plugin initialized")
def __data__(self):
return {
"name": "ExamplePlugin",
"description": "This is an example plugin.",
"compatible_versions": ["*"],
"author": "leafspark",
"version": "v1.0.0",
}
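Based on the plugin loader shown later in this diff, a plugin is applied only when its compatible_versions list contains "*" or the running AUTOGGUF_VERSION; a version-pinned variant of __data__ might look like this (the version strings are illustrative).

    def __data__(self):
        return {
            "name": "ExamplePlugin",
            "description": "Pinned to specific releases.",
            "compatible_versions": ["v2.0.0", "v2.0.1"],
            "author": "leafspark",
            "version": "v1.0.1",
        }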

View File

@ -1,14 +1,9 @@
PyYAML~=6.0.2 psutil~=6.0.0
psutil~=7.0.0 requests~=2.32.3
pynvml~=12.0.0
PySide6~=6.9.1
safetensors~=0.5.3
numpy<2.0.0 numpy<2.0.0
torch~=2.7.0 torch~=2.4.0
sentencepiece~=0.2.0 sentencepiece~=0.2.0
setuptools~=80.7.1 PyYAML~=6.0.2
huggingface-hub~=0.33.1 pynvml~=11.5.3
transformers~=4.51.3 PySide6~=6.7.2
fastapi~=0.115.12 flask~=3.0.3
uvicorn~=0.34.2
certifi~=2025.4.26

run.sh
View File

@ -1,31 +1,6 @@
#!/bin/sh #!/bin/sh
# Check if Python is installed
if ! command -v python3 >/dev/null 2>&1; then
echo "Error: Python 3 is not installed or not in the PATH."
echo "Please install Python 3 and try again."
exit 1
fi
# Set environment variables
export PYTHONIOENCODING=utf-8 export PYTHONIOENCODING=utf-8
export AUTOGGUF_LANGUAGE=en-US export AUTOGGUF_LANGUAGE=en-US
export AUTOGGUF_CHECK_BACKEND=disabled
# Try to run main.py in the current directory python3 src/main.py
if [ -f "main.py" ]; then
echo "Running main.py in the current directory..."
python3 main.py
exit 0
fi
# If main.py doesn't exist in the current directory, try src/main.py
if [ -f "src/main.py" ]; then
echo "Running src/main.py..."
python3 src/main.py
exit 0
fi
# If neither file is found, display an error message
echo "Error: Neither main.py nor src/main.py found."
echo "Please make sure the script is in the correct directory."
exit 1

View File

@ -1,17 +0,0 @@
from setuptools import setup
with open("requirements.txt") as f:
required = f.read().splitlines()
setup(
name="AutoGGUF",
version="v2.0.1",
packages=[""],
url="https://github.com/leafspark/AutoGGUF",
license="apache-2.0",
author="leafspark",
author_email="leafspark@proton.me",
description="automatically quant GGUF models",
install_requires=required,
entry_points={"console_scripts": ["autogguf-gui = main:main"]},
)

File diff suppressed because it is too large

View File

@ -1,112 +0,0 @@
from PySide6.QtCore import QPoint, Qt
from PySide6.QtWidgets import QHBoxLayout, QLabel, QMenuBar, QPushButton, QWidget
class CustomTitleBar(QWidget):
def __init__(self, parent=None) -> None:
super().__init__(parent)
self.parent = parent
layout = QHBoxLayout(self)
layout.setContentsMargins(10, 5, 10, 5)
# Add the favicon
# TODO: uncomment this
# self.icon_label = QLabel()
# self.icon_label.setPixmap(QPixmap(resource_path("assets/favicon.ico")))
# layout.addWidget(self.icon_label)
# Add app title (bolded)
self.title = QLabel("<b>AutoGGUF</b>") # Use HTML tags for bolding
layout.addWidget(self.title)
# Add menubar here
self.menubar = QMenuBar()
layout.addWidget(self.menubar) # Add menubar to the layout
layout.addStretch(1) # This pushes the buttons to the right
# Add minimize and close buttons
self.minimize_button = QPushButton("")
self.close_button = QPushButton("")
for button in (self.minimize_button, self.close_button):
button.setFixedSize(30, 30)
button.setStyleSheet(
"""
QPushButton {
border: none;
background-color: transparent;
}
QPushButton:hover {
background-color: rgba(255, 255, 255, 0.1);
}
"""
)
# Enable mouse tracking for smoother movement
self.setMouseTracking(True)
# Add maximize button
self.maximize_button = QPushButton("")
self.maximize_button.setFixedSize(30, 30)
self.maximize_button.setStyleSheet(
"""
QPushButton {
border: none;
background-color: transparent;
padding: 2px;
font-size: 15px;
}
QPushButton:hover {
background-color: rgba(255, 255, 255, 0.1);
}
"""
)
self.maximize_button.clicked.connect(self.toggle_maximize)
layout.addWidget(self.minimize_button)
layout.addWidget(self.maximize_button)
layout.addWidget(self.close_button)
self.minimize_button.clicked.connect(self.parent.showMinimized)
self.close_button.clicked.connect(self.parent.close)
self.start = QPoint(0, 0)
self.pressing = False
self.isMaximized = False # Flag to track maximization state
self.normal_size = None # Store the normal window size
def mousePressEvent(self, event) -> None:
if event.button() == Qt.LeftButton:
self.start = event.globalPos() - self.parent.frameGeometry().topLeft()
self.pressing = True
def mouseMoveEvent(self, event) -> None:
if self.pressing:
new_pos = event.globalPos() - self.start
screen = self.parent.screen()
screen_geo = screen.availableGeometry()
# Check if the new position would put the titlebar below the taskbar
if (
new_pos.y() + self.parent.height() > screen_geo.bottom()
): # Use screen_geo.bottom()
new_pos.setY(screen_geo.bottom() - self.parent.height())
self.parent.move(new_pos)
def mouseReleaseEvent(self, event) -> None:
self.pressing = False
def toggle_maximize(self) -> None:
if self.isMaximized:
self.parent.showNormal()
if self.normal_size:
self.parent.resize(self.normal_size)
self.maximize_button.setText("") # Change back to maximize symbol
self.isMaximized = False
else:
self.normal_size = self.parent.size() # Store the current size
self.parent.showMaximized()
self.maximize_button.setText("") # Change to restore symbol
self.isMaximized = True
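A sketch of how a frameless host window might adopt this title bar; the window class, window flag, and setMenuWidget placement are assumptions about the host, not code from this diff.

from PySide6.QtCore import Qt
from PySide6.QtWidgets import QMainWindow

class FramelessWindow(QMainWindow):  # hypothetical host window
    def __init__(self) -> None:
        super().__init__()
        # Drop the native frame so the custom bar handles dragging, minimize, maximize and close.
        self.setWindowFlags(Qt.WindowType.FramelessWindowHint)
        self.title_bar = CustomTitleBar(self)
        self.setMenuWidget(self.title_bar)  # mounts the bar above the central widget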

View File

@ -1,10 +1,8 @@
import os import os
import urllib.request
import urllib.error
import zipfile import zipfile
import ssl
import certifi import requests
from PySide6.QtCore import QThread, Signal from PySide6.QtCore import *
class DownloadThread(QThread): class DownloadThread(QThread):
@ -12,33 +10,21 @@ class DownloadThread(QThread):
finished_signal = Signal(str) finished_signal = Signal(str)
error_signal = Signal(str) error_signal = Signal(str)
def __init__(self, url, save_path) -> None: def __init__(self, url, save_path):
super().__init__() super().__init__()
self.url = url self.url = url
self.save_path = save_path self.save_path = save_path
def run(self) -> None: def run(self):
try: try:
req = urllib.request.Request(self.url) response = requests.get(self.url, stream=True)
response.raise_for_status()
# Create SSL context with certifi certificates total_size = int(response.headers.get("content-length", 0))
ssl_context = ssl.create_default_context(cafile=certifi.where())
with urllib.request.urlopen(req, context=ssl_context) as response:
if response.status != 200:
raise urllib.error.HTTPError(
self.url, response.status, "HTTP Error", response.headers, None
)
total_size = int(response.headers.get("Content-Length", 0))
block_size = 8192 block_size = 8192
downloaded = 0 downloaded = 0
with open(self.save_path, "wb") as file: with open(self.save_path, "wb") as file:
while True: for data in response.iter_content(block_size):
data = response.read(block_size)
if not data:
break
size = file.write(data) size = file.write(data)
downloaded += size downloaded += size
if total_size: if total_size:

View File

@ -22,15 +22,11 @@
VRAM_USAGE_OVER_TIME, VRAM_USAGE_OVER_TIME,
NO_GPU_DETECTED, NO_GPU_DETECTED,
AMD_GPU_NOT_SUPPORTED, AMD_GPU_NOT_SUPPORTED,
CPU_USAGE_OVER_TIME,
RAM_USAGE_OVER_TIME,
) )
from ui_update import animate_bar
class SimpleGraph(QGraphicsView): class SimpleGraph(QGraphicsView):
def __init__(self, title, parent=None) -> None: def __init__(self, title, parent=None):
super().__init__(parent) super().__init__(parent)
self.setScene(QGraphicsScene(self)) self.setScene(QGraphicsScene(self))
self.setRenderHint(QPainter.RenderHint.Antialiasing) self.setRenderHint(QPainter.RenderHint.Antialiasing)
@ -39,7 +35,7 @@ def __init__(self, title, parent=None) -> None:
self.title = title self.title = title
self.data = [] self.data = []
def update_data(self, data) -> None: def update_data(self, data):
self.data = data self.data = data
self.scene().clear() self.scene().clear()
if not self.data: if not self.data:
@ -67,13 +63,13 @@ def update_data(self, data) -> None:
line.setPen(path) line.setPen(path)
self.scene().addItem(line) self.scene().addItem(line)
def resizeEvent(self, event) -> None: def resizeEvent(self, event):
super().resizeEvent(event) super().resizeEvent(event)
self.update_data(self.data) self.update_data(self.data)
class GPUMonitor(QWidget): class GPUMonitor(QWidget):
def __init__(self, parent=None) -> None: def __init__(self, parent=None):
super().__init__(parent) super().__init__(parent)
self.setMinimumHeight(30) self.setMinimumHeight(30)
self.setMaximumHeight(30) self.setMaximumHeight(30)
@ -95,7 +91,7 @@ def __init__(self, parent=None) -> None:
self.timer = QTimer(self) self.timer = QTimer(self)
self.timer.timeout.connect(self.update_gpu_info) self.timer.timeout.connect(self.update_gpu_info)
self.timer.start(500) # Update every 0.5 seconds self.timer.start(200) # Update every 0.2 seconds
self.gpu_data = [] self.gpu_data = []
self.vram_data = [] self.vram_data = []
@ -127,17 +123,17 @@ def __init__(self, parent=None) -> None:
if not self.handles: if not self.handles:
self.gpu_label.setText(NO_GPU_DETECTED) self.gpu_label.setText(NO_GPU_DETECTED)
def check_for_amd_gpu(self) -> None: def check_for_amd_gpu(self):
# This is a placeholder. Implementing AMD GPU detection would require # This is a placeholder. Implementing AMD GPU detection would require
# platform-specific methods or additional libraries. # platform-specific methods or additional libraries.
self.gpu_label.setText(AMD_GPU_NOT_SUPPORTED) self.gpu_label.setText(AMD_GPU_NOT_SUPPORTED)
def change_gpu(self, index) -> None: def change_gpu(self, index):
self.current_gpu = index self.current_gpu = index
self.gpu_data.clear() self.gpu_data.clear()
self.vram_data.clear() self.vram_data.clear()
def update_gpu_info(self) -> None: def update_gpu_info(self):
if self.handles: if self.handles:
try: try:
handle = self.handles[self.current_gpu] handle = self.handles[self.current_gpu]
@ -147,7 +143,7 @@ def update_gpu_info(self) -> None:
gpu_usage = utilization.gpu gpu_usage = utilization.gpu
vram_usage = (memory.used / memory.total) * 100 vram_usage = (memory.used / memory.total) * 100
animate_bar(self, self.gpu_bar, int(vram_usage)) self.gpu_bar.setValue(int(vram_usage))
self.gpu_label.setText( self.gpu_label.setText(
GPU_USAGE_FORMAT.format( GPU_USAGE_FORMAT.format(
gpu_usage, gpu_usage,
@ -167,36 +163,11 @@ def update_gpu_info(self) -> None:
self.gpu_bar.setValue(0) self.gpu_bar.setValue(0)
self.gpu_label.setText(GPU_USAGE_FORMAT.format(0, 0, 0, 0)) self.gpu_label.setText(GPU_USAGE_FORMAT.format(0, 0, 0, 0))
def mouseDoubleClickEvent(self, event) -> None: def mouseDoubleClickEvent(self, event):
if self.handles: if self.handles:
self.show_detailed_stats() self.show_detailed_stats()
def show_ram_graph(self, event) -> None: def show_detailed_stats(self):
self.show_detailed_stats_std(RAM_USAGE_OVER_TIME, self.ram_data)
def show_cpu_graph(self, event) -> None:
self.show_detailed_stats_std(CPU_USAGE_OVER_TIME, self.cpu_data)
def show_detailed_stats_std(self, title, data) -> None:
dialog = QDialog(self)
dialog.setWindowTitle(title)
dialog.setMinimumSize(800, 600)
layout = QVBoxLayout(dialog)
graph = SimpleGraph(title)
layout.addWidget(graph)
def update_graph_data() -> None:
graph.update_data(data)
timer = QTimer(dialog)
timer.timeout.connect(update_graph_data)
timer.start(500) # Update every 0.5 seconds
dialog.exec()
def show_detailed_stats(self) -> None:
dialog = QDialog(self) dialog = QDialog(self)
dialog.setWindowTitle(GPU_DETAILS) dialog.setWindowTitle(GPU_DETAILS)
dialog.setMinimumSize(800, 600) dialog.setMinimumSize(800, 600)
@ -221,20 +192,15 @@ def show_detailed_stats(self) -> None:
gpu_graph = SimpleGraph(GPU_USAGE_OVER_TIME) gpu_graph = SimpleGraph(GPU_USAGE_OVER_TIME)
vram_graph = SimpleGraph(VRAM_USAGE_OVER_TIME) vram_graph = SimpleGraph(VRAM_USAGE_OVER_TIME)
def update_graph_data() -> None:
gpu_graph.update_data(self.gpu_data) gpu_graph.update_data(self.gpu_data)
vram_graph.update_data(self.vram_data) vram_graph.update_data(self.vram_data)
timer = QTimer(dialog)
timer.timeout.connect(update_graph_data)
timer.start(500) # Update every 0.5 seconds
tab_widget.addTab(gpu_graph, GPU_USAGE_OVER_TIME) tab_widget.addTab(gpu_graph, GPU_USAGE_OVER_TIME)
tab_widget.addTab(vram_graph, VRAM_USAGE_OVER_TIME) tab_widget.addTab(vram_graph, VRAM_USAGE_OVER_TIME)
dialog.exec() dialog.exec()
def closeEvent(self, event) -> None: def closeEvent(self, event):
if self.handles: if self.handles:
pynvml.nvmlShutdown() pynvml.nvmlShutdown()
super().closeEvent(event) super().closeEvent(event)
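A small sketch of embedding the monitor in a layout, assuming an existing QApplication and that GPUMonitor is importable from the src directory; double-clicking the fixed-height strip opens the detailed graphs shown above.

from PySide6.QtWidgets import QVBoxLayout, QWidget

container = QWidget()
layout = QVBoxLayout(container)
monitor = GPUMonitor()        # polls NVML on a timer as shown above
layout.addWidget(monitor)     # 30 px usage strip
container.show()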

View File

@ -1,42 +1,37 @@
import locale from PySide6.QtWidgets import QWidget, QHBoxLayout, QLineEdit, QComboBox, QPushButton
import os from PySide6.QtCore import Signal, QRegularExpression
import platform
import shutil
import socket
import time
from datetime import datetime
import psutil
from PySide6.QtCore import QRegularExpression, Signal
from PySide6.QtGui import QDoubleValidator, QIntValidator, QRegularExpressionValidator from PySide6.QtGui import QDoubleValidator, QIntValidator, QRegularExpressionValidator
from PySide6.QtWidgets import QComboBox, QHBoxLayout, QLineEdit, QPushButton, QWidget from datetime import datetime
import time
import os
import socket
import platform
class KVOverrideEntry(QWidget): class KVOverrideEntry(QWidget):
deleted = Signal(QWidget) deleted = Signal(QWidget)
def __init__(self, parent=None) -> None: def __init__(self, parent=None):
super().__init__(parent) super().__init__(parent)
layout = QHBoxLayout(self) layout = QHBoxLayout(self)
layout.setContentsMargins(0, 0, 0, 0) layout.setContentsMargins(0, 0, 0, 0)
self.key_input = QLineEdit() self.key_input = QLineEdit()
self.key_input.setPlaceholderText("Key") self.key_input.setPlaceholderText("Key")
# Set validator for key input (letters and dots only) # Set validator for key input (letters and dots only)
key_validator = QRegularExpressionValidator(QRegularExpression(r"[A-Za-z.]+")) key_validator = QRegularExpressionValidator(QRegularExpression(r"[A-Za-z.]+"))
self.key_input.setValidator(key_validator) self.key_input.setValidator(key_validator)
layout.addWidget(self.key_input) layout.addWidget(self.key_input)
self.type_combo = QComboBox() self.type_combo = QComboBox()
self.type_combo.addItems(["int", "str", "float", "u32", "i32"]) self.type_combo.addItems(["int", "str", "float"])
layout.addWidget(self.type_combo) layout.addWidget(self.type_combo)
self.value_input = QLineEdit() self.value_input = QLineEdit()
self.value_input.setPlaceholderText("Value") self.value_input.setPlaceholderText("Value")
layout.addWidget(self.value_input) layout.addWidget(self.value_input)
delete_button = QPushButton("") delete_button = QPushButton("X")
delete_button.setFixedSize(30, 30) delete_button.setFixedSize(30, 30)
delete_button.clicked.connect(self.delete_clicked) delete_button.clicked.connect(self.delete_clicked)
layout.addWidget(delete_button) layout.addWidget(delete_button)
@ -47,16 +42,12 @@ def __init__(self, parent=None) -> None:
# Initialize validator # Initialize validator
self.update_validator(self.type_combo.currentText()) self.update_validator(self.type_combo.currentText())
def delete_clicked(self) -> None: def delete_clicked(self):
self.deleted.emit(self) self.deleted.emit(self)
def get_override_string( def get_override_string(
self, self, model_name=None, quant_type=None, output_path=None
model_name=None, ): # Add arguments
quant_type=None,
output_path=None,
quantization_parameters=None,
) -> str: # Add arguments
key = self.key_input.text() key = self.key_input.text()
type_ = self.type_combo.currentText() type_ = self.type_combo.currentText()
value = self.value_input.text() value = self.value_input.text()
@ -70,14 +61,8 @@ def get_override_string(
"{system.hostname}": lambda: socket.gethostname(), "{system.hostname}": lambda: socket.gethostname(),
"{system.platform}": lambda: platform.system(), "{system.platform}": lambda: platform.system(),
"{system.python.version}": lambda: platform.python_version(), "{system.python.version}": lambda: platform.python_version(),
"{system.timezone}": lambda: time.tzname[time.daylight], "{system.time.milliseconds}": lambda: str(int(time.time() * 1000)),
"{system.cpus}": lambda: str(os.cpu_count()), "{system.date}": lambda: datetime.now().strftime("%Y-%m-%d"),
"{system.memory.total}": lambda: str(psutil.virtual_memory().total),
"{system.memory.free}": lambda: str(psutil.virtual_memory().free),
"{system.filesystem.used}": lambda: str(shutil.disk_usage("/").used),
"{system.kernel.version}": lambda: platform.release(),
"{system.locale}": lambda: locale.getdefaultlocale()[0],
"{process.nice}": lambda: str(os.nice(0)),
"{model.name}": lambda: ( "{model.name}": lambda: (
model_name if model_name is not None else "Unknown Model" model_name if model_name is not None else "Unknown Model"
), ),
@ -87,21 +72,6 @@ def get_override_string(
"{output.path}": lambda: ( "{output.path}": lambda: (
output_path if output_path is not None else "Unknown Output Path" output_path if output_path is not None else "Unknown Output Path"
), ),
"{quant.kv}": lambda: (
quantization_parameters[0]
if quantization_parameters is not None
else False
),
"{quant.requantized}": lambda: (
quantization_parameters[1]
if quantization_parameters is not None
else False
),
"{quant.leave_output_tensor}": lambda: (
quantization_parameters[2]
if quantization_parameters is not None
else False
),
} }
for param, func in dynamic_params.items(): for param, func in dynamic_params.items():
@ -109,11 +79,11 @@ def get_override_string(
return f"{key}={type_}:{value}" return f"{key}={type_}:{value}"
def get_raw_override_string(self) -> str: def get_raw_override_string(self):
# Return the raw override string with placeholders intact # Return the raw override string with placeholders intact
return f"{self.key_input.text()}={self.type_combo.currentText()}:{self.value_input.text()}" return f"{self.key_input.text()}={self.type_combo.currentText()}:{self.value_input.text()}"
def update_validator(self, type_) -> None: def update_validator(self, type_):
if type_ == "int": if type_ == "int":
self.value_input.setValidator(QIntValidator()) self.value_input.setValidator(QIntValidator())
elif type_ == "float": elif type_ == "float":

File diff suppressed because it is too large

View File

@ -5,7 +5,7 @@
class Logger: class Logger:
def __init__(self, name, log_dir) -> None: def __init__(self, name, log_dir):
self.logger = logging.getLogger(name) self.logger = logging.getLogger(name)
self.logger.setLevel(logging.DEBUG) self.logger.setLevel(logging.DEBUG)
@ -34,17 +34,17 @@ def __init__(self, name, log_dir) -> None:
self.logger.addHandler(console_handler) self.logger.addHandler(console_handler)
self.logger.addHandler(file_handler) self.logger.addHandler(file_handler)
def debug(self, message) -> None: def debug(self, message):
self.logger.debug(message) self.logger.debug(message)
def info(self, message) -> None: def info(self, message):
self.logger.info(message) self.logger.info(message)
def warning(self, message) -> None: def warning(self, message):
self.logger.warning(message) self.logger.warning(message)
def error(self, message) -> None: def error(self, message):
self.logger.error(message) self.logger.error(message)
def critical(self, message) -> None: def critical(self, message):
self.logger.critical(message) self.logger.critical(message)

View File

@ -1,8 +1,8 @@
from PySide6.QtWidgets import QVBoxLayout, QTextEdit, QDialog, QPushButton from PySide6.QtWidgets import *
class ModelInfoDialog(QDialog): class ModelInfoDialog(QDialog):
def __init__(self, model_info, parent=None) -> None: def __init__(self, model_info, parent=None):
super().__init__(parent) super().__init__(parent)
self.setWindowTitle("Model Information") self.setWindowTitle("Model Information")
self.setGeometry(200, 200, 600, 400) self.setGeometry(200, 200, 600, 400)
@ -21,24 +21,11 @@ def __init__(self, model_info, parent=None) -> None:
self.setLayout(layout) self.setLayout(layout)
def format_model_info(self, model_info) -> str: def format_model_info(self, model_info):
html = "<h2>Model Information</h2>" html = "<h2>Model Information</h2>"
html += f"<p><b>Architecture:</b> {model_info.get('architecture', 'N/A')}</p>" html += f"<p><b>Architecture:</b> {model_info.get('architecture', 'N/A')}</p>"
html += f"<p><b>Quantization Type:</b> {model_info.get('quantization_type', 'N/A')}</p>"
# Format quantization types html += f"<p><b>KV Pairs:</b> {model_info.get('kv_pairs', 'N/A')}</p>"
quant_types = model_info.get("quantization_type", [])
if quant_types:
# Clean up the format: remove "- type " prefix and join with " | "
formatted_types = []
for qtype in quant_types:
# Remove "- type " prefix if present
clean_type = qtype.replace("- type ", "").strip()
formatted_types.append(clean_type)
quant_display = " | ".join(formatted_types)
else:
quant_display = "N/A"
html += f"<p><b>Quantization Type:</b> {quant_display}</p>"
html += f"<p><b>Tensors:</b> {model_info.get('tensors', 'N/A')}</p>" html += f"<p><b>Tensors:</b> {model_info.get('tensors', 'N/A')}</p>"
html += "<h3>Key-Value Pairs:</h3>" html += "<h3>Key-Value Pairs:</h3>"

View File

@ -1,81 +0,0 @@
import importlib
import os
from typing import Any, Dict
from Localizations import *
class Plugins:
def load_plugins(self) -> Dict[str, Dict[str, Any]]:
plugins = {}
plugin_dir = "plugins"
if not os.path.exists(plugin_dir):
self.logger.info(PLUGINS_DIR_NOT_EXIST.format(plugin_dir))
return plugins
if not os.path.isdir(plugin_dir):
self.logger.warning(PLUGINS_DIR_NOT_DIRECTORY.format(plugin_dir))
return plugins
for file in os.listdir(plugin_dir):
if file.endswith(".py") and not file.endswith(".disabled.py"):
name = file[:-3]
path = os.path.join(plugin_dir, file)
try:
spec = importlib.util.spec_from_file_location(name, path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
for item_name in dir(module):
item = getattr(module, item_name)
if isinstance(item, type) and hasattr(item, "__data__"):
plugin_instance = item()
plugin_data = plugin_instance.__data__()
compatible_versions = plugin_data.get(
"compatible_versions", []
)
if (
"*" in compatible_versions
or AUTOGGUF_VERSION in compatible_versions
):
plugins[name] = {
"instance": plugin_instance,
"data": plugin_data,
}
self.logger.info(
PLUGIN_LOADED.format(
plugin_data["name"], plugin_data["version"]
)
)
else:
self.logger.warning(
PLUGIN_INCOMPATIBLE.format(
plugin_data["name"],
plugin_data["version"],
AUTOGGUF_VERSION,
", ".join(compatible_versions),
)
)
break
except Exception as e:
self.logger.error(PLUGIN_LOAD_FAILED.format(name, str(e)))
return plugins
def apply_plugins(self) -> None:
if not self.plugins:
self.logger.info(NO_PLUGINS_LOADED)
return
for plugin_name, plugin_info in self.plugins.items():
plugin_instance = plugin_info["instance"]
for attr_name in dir(plugin_instance):
if not attr_name.startswith("__") and attr_name != "init":
attr_value = getattr(plugin_instance, attr_name)
setattr(self, attr_name, attr_value)
if hasattr(plugin_instance, "init") and callable(plugin_instance.init):
plugin_instance.init(self)
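A rough sketch of how a host window could use this mixin; the class name and module paths are assumptions, and the mixin is assumed to rely on a self.logger supplied by the host, as the calls above suggest.

from PySide6.QtWidgets import QMainWindow
from Plugins import Plugins   # assumed module path
from Logger import Logger     # assumed module path

class MainWindow(Plugins, QMainWindow):  # hypothetical host class
    def __init__(self) -> None:
        super().__init__()
        self.logger = Logger("AutoGGUF", log_dir="logs")
        self.plugins = self.load_plugins()  # scans ./plugins for *.py files exposing __data__()
        self.apply_plugins()                # copies plugin attributes onto the window, then calls init()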

View File

@ -1,12 +1,10 @@
import os
import re
import signal import signal
import subprocess import subprocess
from PySide6.QtCore import Signal, QThread from PySide6.QtCore import *
from globals import open_file_safe from imports_and_globals import open_file_safe
from Localizations import IN_PROGRESS, COMPLETED from Localizations import *
class QuantizationThread(QThread): class QuantizationThread(QThread):
@ -17,7 +15,7 @@ class QuantizationThread(QThread):
error_signal = Signal(str) error_signal = Signal(str)
model_info_signal = Signal(dict) model_info_signal = Signal(dict)
def __init__(self, command, cwd, log_file) -> None: def __init__(self, command, cwd, log_file):
super().__init__() super().__init__()
self.command = command self.command = command
self.cwd = cwd self.cwd = cwd
@ -25,7 +23,7 @@ def __init__(self, command, cwd, log_file) -> None:
self.process = None self.process = None
self.model_info = {} self.model_info = {}
def run(self) -> None: def run(self):
try: try:
# Start the subprocess # Start the subprocess
self.process = subprocess.Popen( self.process = subprocess.Popen(
@ -58,35 +56,7 @@ def run(self) -> None:
except Exception as e: except Exception as e:
self.error_signal.emit(str(e)) self.error_signal.emit(str(e))
def parse_model_info(self, line) -> None: def parse_model_info(self, line):
# Mapping of technical keys to human-readable names
key_mappings = {
"general.architecture": "Architecture",
"general.name": "Model Name",
"general.file_type": "File Type",
"general.quantization_version": "Quantization Version",
"llama.block_count": "Layers",
"llama.context_length": "Context Length",
"llama.embedding_length": "Embedding Size",
"llama.feed_forward_length": "Feed Forward Length",
"llama.attention.head_count": "Attention Heads",
"llama.attention.head_count_kv": "Key-Value Heads",
"llama.attention.layer_norm_rms_epsilon": "RMS Norm Epsilon",
"llama.rope.freq_base": "RoPE Frequency Base",
"llama.rope.dimension_count": "RoPE Dimensions",
"llama.vocab_size": "Vocabulary Size",
"tokenizer.ggml.model": "Tokenizer Model",
"tokenizer.ggml.pre": "Tokenizer Preprocessing",
"tokenizer.ggml.tokens": "Tokens",
"tokenizer.ggml.token_type": "Token Types",
"tokenizer.ggml.merges": "BPE Merges",
"tokenizer.ggml.bos_token_id": "Begin of Sequence Token ID",
"tokenizer.ggml.eos_token_id": "End of Sequence Token ID",
"tokenizer.chat_template": "Chat Template",
"tokenizer.ggml.padding_token_id": "Padding Token ID",
"tokenizer.ggml.unk_token_id": "Unknown Token ID",
}
# Parse output for model information # Parse output for model information
if "llama_model_loader: loaded meta data with" in line: if "llama_model_loader: loaded meta data with" in line:
parts = line.split() parts = line.split()
@ -94,25 +64,10 @@ def parse_model_info(self, line) -> None:
self.model_info["tensors"] = parts[9] self.model_info["tensors"] = parts[9]
elif "general.architecture" in line: elif "general.architecture" in line:
self.model_info["architecture"] = line.split("=")[-1].strip() self.model_info["architecture"] = line.split("=")[-1].strip()
elif line.startswith("llama_model_loader: - kv") and "=" in line: elif line.startswith("llama_model_loader: - kv"):
# Split on '=' and take the parts key = line.split(":")[2].strip()
parts = line.split("=", 1) # Split only on first '=' value = line.split("=")[-1].strip()
left_part = parts[0].strip() self.model_info.setdefault("kv_data", {})[key] = value
value = parts[1].strip()
# Extract key and type from left part
# Format: "llama_model_loader: - kv N: key type"
kv_parts = left_part.split(":")
if len(kv_parts) >= 3:
key_type_part = kv_parts[2].strip() # This is "key type"
key = key_type_part.rsplit(" ", 1)[
0
] # Everything except last word (type)
# Use human-readable name if available, otherwise use original key
display_key = key_mappings.get(key, key)
self.model_info.setdefault("kv_data", {})[display_key] = value
elif line.startswith("llama_model_loader: - type"): elif line.startswith("llama_model_loader: - type"):
parts = line.split(":") parts = line.split(":")
if len(parts) > 1: if len(parts) > 1:
@ -122,31 +77,7 @@ def parse_model_info(self, line) -> None:
f"{quant_type}: {tensors} tensors" f"{quant_type}: {tensors} tensors"
) )
def parse_progress(self, line, task_item, imatrix_chunks=None) -> None: def terminate(self):
# Parses the output line for progress information and updates the task item.
match = re.search(r"\[\s*(\d+)\s*/\s*(\d+)\s*].*", line)
if match:
current = int(match.group(1))
total = int(match.group(2))
progress = int((current / total) * 100)
task_item.update_progress(progress)
else:
imatrix_match = re.search(
r"compute_imatrix: computing over (\d+) chunks with batch_size \d+",
line,
)
if imatrix_match:
imatrix_chunks = int(imatrix_match.group(1))
elif imatrix_chunks is not None:
if "save_imatrix: stored collected data" in line:
save_match = re.search(r"collected data after (\d+) chunks", line)
if save_match:
saved_chunks = int(save_match.group(1))
progress = int((saved_chunks / self.imatrix_chunks) * 100)
task_item.update_progress(progress)
def terminate(self) -> None:
# Terminate the subprocess if it's still running # Terminate the subprocess if it's still running
if self.process: if self.process:
os.kill(self.process.pid, signal.SIGTERM) os.kill(self.process.pid, signal.SIGTERM)
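The progress regex in parse_progress() can be exercised on its own; the sample log line below is an illustrative llama.cpp-style output line, not taken from this diff.

import re

line = "[  42/ 291]  blk.4.attn_q.weight - converting to q4_K"
match = re.search(r"\[\s*(\d+)\s*/\s*(\d+)\s*].*", line)
if match:
    progress = int(int(match.group(1)) / int(match.group(2)) * 100)
    print(progress)  # -> 14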

View File

@ -1,52 +1,14 @@
from typing import List
from PySide6.QtCore import * from PySide6.QtCore import *
from PySide6.QtGui import QAction
from PySide6.QtWidgets import * from PySide6.QtWidgets import *
from Localizations import (
DELETING_TASK,
CANCELLING_TASK,
CONFIRM_DELETION_TITLE,
CONFIRM_DELETION,
SHOWING_TASK_CONTEXT_MENU,
CANCELED,
CANCEL,
PROPERTIES,
COMPLETED,
SHOWING_PROPERTIES_FOR_TASK,
DELETE,
RESTART,
IN_PROGRESS,
ERROR,
RESTARTING_TASK,
)
from ModelInfoDialog import ModelInfoDialog
from QuantizationThread import QuantizationThread
from Logger import Logger
from error_handling import handle_error
class TaskListItem(QWidget): class TaskListItem(QWidget):
def __init__( def __init__(self, task_name, log_file, show_progress_bar=True, parent=None):
self,
task_name,
log_file,
show_progress_bar=True,
parent=None,
show_properties=False,
logger=Logger,
quant_threads=List[QuantizationThread],
) -> None:
super().__init__(parent) super().__init__(parent)
self.quant_threads = quant_threads
self.task_name = task_name self.task_name = task_name
self.log_file = log_file self.log_file = log_file
self.logger = logger
self.show_properties = show_properties
self.status = "Pending" self.status = "Pending"
layout = QHBoxLayout(self) layout = QHBoxLayout(self)
self.task_label = QLabel(task_name) self.task_label = QLabel(task_name)
self.progress_bar = QProgressBar() self.progress_bar = QProgressBar()
self.progress_bar.setRange(0, 100) self.progress_bar.setRange(0, 100)
@ -66,136 +28,34 @@ def __init__(
self.progress_timer.timeout.connect(self.update_progress) self.progress_timer.timeout.connect(self.update_progress)
self.progress_value = 0 self.progress_value = 0
def show_task_context_menu(self, position) -> None: def update_status(self, status):
self.logger.debug(SHOWING_TASK_CONTEXT_MENU)
item = self.task_list.itemAt(position)
if item is not None:
context_menu = QMenu(self)
properties_action = QAction(PROPERTIES, self)
properties_action.triggered.connect(lambda: self.show_task_properties(item))
context_menu.addAction(properties_action)
task_item = self.task_list.itemWidget(item)
if task_item.status != COMPLETED:
cancel_action = QAction(CANCEL, self)
cancel_action.triggered.connect(lambda: self.cancel_task(item))
context_menu.addAction(cancel_action)
if task_item.status == CANCELED:
restart_action = QAction(RESTART, self)
restart_action.triggered.connect(lambda: self.restart_task(task_item))
context_menu.addAction(restart_action)
delete_action = QAction(DELETE, self)
delete_action.triggered.connect(lambda: self.delete_task(item))
context_menu.addAction(delete_action)
context_menu.exec(self.task_list.viewport().mapToGlobal(position))
def show_task_properties(self, item) -> None:
self.logger.debug(SHOWING_PROPERTIES_FOR_TASK.format(item.text()))
for thread in self.quant_threads:
model_info_dialog = ModelInfoDialog(thread.model_info, self)
model_info_dialog.exec()
break
def cancel_task(self, item) -> None:
# TODO: fix possibly buggy signal behavior
task_item = self.task_list.itemWidget(item)
if task_item:
task_name = task_item.task_name # Store the name before any changes
self.logger.info(CANCELLING_TASK.format(task_name))
# Find the thread and disconnect signals before terminating
for thread in self.quant_threads:
if thread.log_file == task_item.log_file:
# Disconnect all signals from this thread first
try:
thread.error_signal.disconnect() # Disconnect all error signal connections
thread.output_signal.disconnect() # Disconnect all output signal connections
except TypeError:
# No connections to disconnect
pass
# Now terminate the thread
thread.terminate()
self.quant_threads.remove(thread)
break
def delete_task(self, item) -> None:
task_item = self.task_list.itemWidget(item)
if not task_item:
return
task_name = task_item.task_name # Store task_name before deletion
self.logger.info(DELETING_TASK.format(task_name))
reply = QMessageBox.question(
self,
CONFIRM_DELETION_TITLE,
CONFIRM_DELETION,
QMessageBox.StandardButton.Yes | QMessageBox.StandardButton.No,
QMessageBox.StandardButton.No,
)
if reply == QMessageBox.StandardButton.Yes:
# Cancel the task first (which disconnects signals)
self.cancel_task(item)
# Now remove from list and delete
row = self.task_list.row(item)
self.task_list.takeItem(row)
# Delete the widget after removing from list
task_item.deleteLater()
def update_status(self, status) -> None:
self.status = status self.status = status
self.status_label.setText(status) self.status_label.setText(status)
if status == IN_PROGRESS: if status == "In Progress":
# Only start timer if showing percentage progress # Only start timer if showing percentage progress
if self.progress_bar.isVisible(): if self.progress_bar.isVisible():
self.progress_bar.setRange(0, 100) self.progress_bar.setRange(0, 100)
self.progress_timer.start(100) self.progress_timer.start(100)
elif status == COMPLETED: elif status == "Completed":
self.progress_timer.stop() self.progress_timer.stop()
self.progress_bar.setValue(100) self.progress_bar.setValue(100)
elif status == CANCELED: elif status == "Canceled":
self.progress_timer.stop() self.progress_timer.stop()
self.progress_bar.setValue(0) self.progress_bar.setValue(0)
def set_error(self) -> None: def set_error(self):
self.status = ERROR self.status = "Error"
self.status_label.setText(ERROR) self.status_label.setText("Error")
self.status_label.setStyleSheet("color: red;") self.status_label.setStyleSheet("color: red;")
self.progress_bar.setRange(0, 100) self.progress_bar.setRange(0, 100)
self.progress_timer.stop() self.progress_timer.stop()
def update_progress(self, value=None) -> None: def update_progress(self, value=None):
if value is not None: if value is not None:
# Update progress bar with specific value # Update progress bar with specific value
self.progress_value = value self.progress_value = value
self.progress_bar.setValue(self.progress_value) self.progress_bar.setValue(self.progress_value)
else: else:
return # Increment progress bar for indeterminate progress
self.progress_value = (self.progress_value + 1) % 101
def restart_task(self, task_item) -> None: self.progress_bar.setValue(self.progress_value)
self.logger.info(RESTARTING_TASK.format(task_item.task_name))
for thread in self.quant_threads:
if thread.log_file == task_item.log_file:
new_thread = QuantizationThread(
thread.command, thread.cwd, thread.log_file
)
self.quant_threads.append(new_thread)
new_thread.status_signal.connect(task_item.update_status)
new_thread.finished_signal.connect(
lambda: self.task_finished(new_thread, task_item)
)
new_thread.error_signal.connect(
lambda err: handle_error(self.logger, err, task_item)
)
new_thread.model_info_signal.connect(self.update_model_info)
new_thread.start()
task_item.update_status(IN_PROGRESS)
break

File diff suppressed because it is too large

View File

@ -1,17 +1,19 @@
from __future__ import annotations from __future__ import annotations
import json
import logging import logging
import json
import os import os
import struct import struct
import sys import sys
from typing import BinaryIO from pathlib import Path
from typing import Any, BinaryIO, Sequence
import numpy as np import numpy as np
import torch import torch
from gguf.constants import * if "NO_LOCAL_GGUF" not in os.environ:
from gguf.tensor_mapping import * sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
import gguf
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("lora-to-gguf") logger = logging.getLogger("lora-to-gguf")
@ -49,20 +51,25 @@ def write_tensor_header(
fout.seek((fout.tell() + 31) & -32) fout.seek((fout.tell() + 31) & -32)
def pyinstaller_include():
# PyInstaller import
pass
if __name__ == "__main__": if __name__ == "__main__":
if len(sys.argv) < 2: if len(sys.argv) < 2:
logger.info(f"Usage: python {sys.argv[0]} <path> <output_path> [arch]") logger.info(f"Usage: python {sys.argv[0]} <path> [arch]")
logger.info( logger.info(
"Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'" "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
) )
logger.info( logger.info(
f"Arch must be one of {list(MODEL_ARCH_NAMES.values())} (default: llama)" f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)"
) )
sys.exit(1) sys.exit(1)
input_json = os.path.join(sys.argv[1], "adapter_config.json") input_json = os.path.join(sys.argv[1], "adapter_config.json")
input_model = os.path.join(sys.argv[1], "adapter_model.bin") input_model = os.path.join(sys.argv[1], "adapter_model.bin")
output_path = sys.argv[2] output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
if os.path.exists(input_model): if os.path.exists(input_model):
model = torch.load(input_model, map_location="cpu") model = torch.load(input_model, map_location="cpu")
@ -73,16 +80,16 @@ def write_tensor_header(
model = load_file(input_model, device="cpu") model = load_file(input_model, device="cpu")
arch_name = sys.argv[3] if len(sys.argv) == 4 else "llama" arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
if arch_name not in MODEL_ARCH_NAMES.values(): if arch_name not in gguf.MODEL_ARCH_NAMES.values():
logger.error(f"Error: unsupported architecture {arch_name}") logger.error(f"Error: unsupported architecture {arch_name}")
sys.exit(1) sys.exit(1)
arch = list(MODEL_ARCH_NAMES.keys())[ arch = list(gguf.MODEL_ARCH_NAMES.keys())[
list(MODEL_ARCH_NAMES.values()).index(arch_name) list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)
] ]
name_map = TensorNameMap(arch, 500) name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
with open(input_json, "r") as f: with open(input_json, "r") as f:
params = json.load(f) params = json.load(f)

View File

@ -18,16 +18,18 @@
SupportsIndex, SupportsIndex,
cast, cast,
) )
from transformers import AutoConfig
import torch import torch
if TYPE_CHECKING: if TYPE_CHECKING:
from torch import Tensor from torch import Tensor
if "NO_LOCAL_GGUF" not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / "gguf-py"))
import gguf import gguf
# reuse model definitions from convert_hf_to_gguf.py # reuse model definitions from convert_hf_to_gguf.py
from convert_hf_to_gguf import LazyTorchTensor, ModelBase from convert_hf_to_gguf import LazyTorchTensor, Model
logger = logging.getLogger("lora-to-gguf") logger = logging.getLogger("lora-to-gguf")
@ -240,15 +242,17 @@ def get_base_tensor_name(lora_tensor_name: str) -> str:
base_name = lora_tensor_name.replace("base_model.model.", "") base_name = lora_tensor_name.replace("base_model.model.", "")
base_name = base_name.replace(".lora_A.weight", ".weight") base_name = base_name.replace(".lora_A.weight", ".weight")
base_name = base_name.replace(".lora_B.weight", ".weight") base_name = base_name.replace(".lora_B.weight", ".weight")
# models produced by mergekit-extract-lora have token embeddings in the adapter
base_name = base_name.replace(".lora_embedding_A", ".weight")
base_name = base_name.replace(".lora_embedding_B", ".weight")
return base_name return base_name
def pyinstaller_include():
# PyInstaller import
pass
def parse_args() -> argparse.Namespace: def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Convert a Hugging Face PEFT LoRA adapter to a GGUF file" description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file"
) )
parser.add_argument( parser.add_argument(
"--outfile", "--outfile",
@ -285,28 +289,18 @@ def parse_args() -> argparse.Namespace:
parser.add_argument( parser.add_argument(
"--base", "--base",
type=Path, type=Path,
help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config", required=True,
) help="directory containing base model file",
parser.add_argument(
"--base-model-id",
type=str,
help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
) )
parser.add_argument( parser.add_argument(
"lora_path", "lora_path",
type=Path, type=Path,
help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)", help="directory containing LoRA adapter file",
) )
return parser.parse_args() return parser.parse_args()
def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
# normally, adapter does not come with base model config, we need to load it from AutoConfig
config = AutoConfig.from_pretrained(hf_model_id)
return config.to_dict()
if __name__ == "__main__": if __name__ == "__main__":
args = parse_args() args = parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
@ -321,9 +315,8 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
ftype = ftype_map[args.outtype] ftype = ftype_map[args.outtype]
dir_base_model: Path | None = args.base dir_base_model: Path = args.base
dir_lora: Path = args.lora_path dir_lora: Path = args.lora_path
base_model_id: str | None = args.base_model_id
lora_config = dir_lora / "adapter_config.json" lora_config = dir_lora / "adapter_config.json"
input_model = dir_lora / "adapter_model.safetensors" input_model = dir_lora / "adapter_model.safetensors"
@ -342,41 +335,12 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
input_model = os.path.join(dir_lora, "adapter_model.bin") input_model = os.path.join(dir_lora, "adapter_model.bin")
lora_model = torch.load(input_model, map_location="cpu", weights_only=True) lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
# load LoRA config
with open(lora_config, "r") as f:
lparams: dict[str, Any] = json.load(f)
# load base model # load base model
if base_model_id is not None:
logger.info(f"Loading base model from Hugging Face: {base_model_id}")
hparams = load_hparams_from_hf(base_model_id)
elif dir_base_model is None:
if "base_model_name_or_path" in lparams:
model_id = lparams["base_model_name_or_path"]
logger.info(f"Loading base model from Hugging Face: {model_id}")
try:
hparams = load_hparams_from_hf(model_id)
except OSError as e:
logger.error(f"Failed to load base model config: {e}")
logger.error(
"Please try downloading the base model and add its path to --base"
)
sys.exit(1)
else:
logger.error(
"'base_model_name_or_path' is not found in adapter_config.json"
)
logger.error(
"Base model config is required. Please download the base model and add its path to --base"
)
sys.exit(1)
else:
logger.info(f"Loading base model: {dir_base_model.name}") logger.info(f"Loading base model: {dir_base_model.name}")
hparams = ModelBase.load_hparams(dir_base_model) hparams = Model.load_hparams(dir_base_model)
with torch.inference_mode(): with torch.inference_mode():
try: try:
model_class = ModelBase.from_model_architecture(hparams["architectures"][0]) model_class = Model.from_model_architecture(hparams["architectures"][0])
except NotImplementedError: except NotImplementedError:
logger.error(f"Model {hparams['architectures'][0]} is not supported") logger.error(f"Model {hparams['architectures'][0]} is not supported")
sys.exit(1) sys.exit(1)
@ -395,9 +359,6 @@ def __init__(
self.dir_model_card = dir_lora_model self.dir_model_card = dir_lora_model
self.lora_alpha = float(lora_alpha) self.lora_alpha = float(lora_alpha)
def set_vocab(self):
pass
def set_type(self): def set_type(self):
self.gguf_writer.add_type(gguf.GGUFType.ADAPTER) self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
@ -406,10 +367,7 @@ def set_gguf_parameters(self):
self.gguf_writer.add_float32( self.gguf_writer.add_float32(
gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha
) )
super().set_gguf_parameters()
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
# Never add extra tensors (e.g. rope_freqs) for LoRA adapters
return ()
def get_tensors(self) -> Iterator[tuple[str, Tensor]]: def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
tensor_map: dict[str, PartialLoraTensor] = {} tensor_map: dict[str, PartialLoraTensor] = {}
@ -418,26 +376,14 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
if self.lazy: if self.lazy:
tensor = LazyTorchTensor.from_eager(tensor) tensor = LazyTorchTensor.from_eager(tensor)
base_name = get_base_tensor_name(name) base_name = get_base_tensor_name(name)
# note: mergekit-extract-lora also adds token embeddings to the adapter is_lora_a = ".lora_A.weight" in name
is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name is_lora_b = ".lora_B.weight" in name
is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
if not is_lora_a and not is_lora_b: if not is_lora_a and not is_lora_b:
if ".base_layer.weight" in name: if ".base_layer.weight" in name:
continue continue
# mergekit-extract-lora add these layernorm to the adapter, we need to keep them
if "_layernorm" in name or ".norm" in name:
yield (base_name, tensor)
continue
logger.error( logger.error(
f"Unexpected name '{name}': Not a lora_A or lora_B tensor" f"Unexpected name '{name}': Not a lora_A or lora_B tensor"
) )
if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
logger.error(
"Embeddings is present in the adapter. This can be due to new tokens added during fine tuning"
)
logger.error(
"Please refer to https://github.com/ggml-org/llama.cpp/pull/9948"
)
sys.exit(1) sys.exit(1)
if base_name in tensor_map: if base_name in tensor_map:
@ -462,34 +408,17 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
def modify_tensors( def modify_tensors(
self, data_torch: Tensor, name: str, bid: int | None self, data_torch: Tensor, name: str, bid: int | None
) -> Iterable[tuple[str, Tensor]]: ) -> Iterable[tuple[str, Tensor]]:
dest = list(super().modify_tensors(data_torch, name, bid)) dest = super().modify_tensors(data_torch, name, bid)
# some archs may have the same tensor for lm_head and output (tie word embeddings)
# in this case, adapters targeting lm_head will fail when using llama-export-lora
# therefore, we ignore them for now
# see: https://github.com/ggml-org/llama.cpp/issues/9065
if name == "lm_head.weight" and len(dest) == 0:
raise ValueError(
"lm_head is present in adapter, but is ignored in base model"
)
for dest_name, dest_data in dest: for dest_name, dest_data in dest:
# mergekit-extract-lora add these layernorm to the adapter
if "_norm" in dest_name:
assert dest_data.dim() == 1
yield (dest_name, dest_data)
continue
# otherwise, we must get the lora_A and lora_B tensors
assert isinstance(dest_data, LoraTorchTensor) assert isinstance(dest_data, LoraTorchTensor)
lora_a, lora_b = dest_data.get_lora_A_B() lora_a, lora_b = dest_data.get_lora_A_B()
# note: mergekit-extract-lora flip and transpose A and B
# here we only need to transpose token_embd.lora_a, see llm_build_inp_embd()
if "token_embd.weight" in dest_name:
lora_a = lora_a.T
yield (dest_name + ".lora_a", lora_a) yield (dest_name + ".lora_a", lora_a)
yield (dest_name + ".lora_b", lora_b) yield (dest_name + ".lora_b", lora_b)
with open(lora_config, "r") as f:
lparams: dict[str, Any] = json.load(f)
alpha: float = lparams["lora_alpha"] alpha: float = lparams["lora_alpha"]
model_instance = LoraModel( model_instance = LoraModel(
@ -502,7 +431,6 @@ def modify_tensors(
dry_run=args.dry_run, dry_run=args.dry_run,
dir_lora_model=dir_lora, dir_lora_model=dir_lora,
lora_alpha=alpha, lora_alpha=alpha,
hparams=hparams,
) )
logger.info("Exporting model...") logger.info("Exporting model...")

View File

@ -1,105 +0,0 @@
import argparse
import json
import sys
from pathlib import Path
import numpy as np
import torch
from safetensors.torch import save_file
import gguf
def dequantize_tensor(tensor):
if tensor.tensor_type in [
gguf.GGMLQuantizationType.F32,
gguf.GGMLQuantizationType.F16,
gguf.GGMLQuantizationType.BF16,
]:
return np.array(tensor.data)
else:
return tensor.data.astype(np.float32)
def gguf_to_safetensors(gguf_path, safetensors_path, metadata_path=None):
try:
reader = gguf.GGUFReader(gguf_path)
except Exception as e:
print(f"Error reading GGUF file: {e}", file=sys.stderr)
sys.exit(1)
tensors = {}
metadata = {}
for tensor in reader.tensors:
try:
dequantized_data = dequantize_tensor(tensor)
tensors[tensor.name] = torch.from_numpy(
dequantized_data.reshape(tuple(reversed(tensor.shape)))
)
except Exception as e:
print(f"Error processing tensor {tensor.name}: {e}", file=sys.stderr)
continue
for field_name, field in reader.fields.items():
if field.data:
metadata[field_name] = field.parts[field.data[0]].tolist()
try:
save_file(tensors, safetensors_path)
except Exception as e:
print(f"Error saving SafeTensors file: {e}", file=sys.stderr)
sys.exit(1)
decoded_metadata = {}
for key, value in metadata.items():
if isinstance(value, list) and all(isinstance(item, int) for item in value):
decoded_value = ""
for item in value:
if 48 <= item <= 57:
decoded_value += str(item - 48)
elif 32 <= item <= 126:
decoded_value += chr(item)
else:
decoded_value += str(item)
decoded_metadata[key] = decoded_value
else:
decoded_metadata[key] = value
if metadata_path:
try:
with open(metadata_path, "w") as f:
json.dump(decoded_metadata, f, indent=4)
except Exception as e:
print(f"Error saving metadata file: {e}", file=sys.stderr)
def main():
parser = argparse.ArgumentParser(description="Convert GGUF to SafeTensors format")
parser.add_argument("gguf_path", type=str, help="Path to the input GGUF file")
parser.add_argument(
"safetensors_path", type=str, help="Path to save the SafeTensors file"
)
parser.add_argument(
"--metadata_path",
type=str,
help="Optional path to save metadata as a JSON file",
)
args = parser.parse_args()
gguf_path = Path(args.gguf_path)
safetensors_path = Path(args.safetensors_path)
metadata_path = Path(args.metadata_path) if args.metadata_path else None
if not gguf_path.exists():
print(f"Error: GGUF file '{gguf_path}' does not exist.", file=sys.stderr)
sys.exit(1)
print(f"Converting {gguf_path} to {safetensors_path}")
gguf_to_safetensors(gguf_path, safetensors_path, metadata_path)
print("Conversion complete.")
if __name__ == "__main__":
main()
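The converter above can also be called directly from Python instead of through argparse; the file names here are illustrative.

gguf_to_safetensors(
    "model.Q8_0.gguf",
    "model.safetensors",
    metadata_path="model.metadata.json",  # optional JSON dump of the GGUF fields
)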

View File

@ -1,13 +1,13 @@
from PySide6.QtWidgets import QMessageBox from PySide6.QtWidgets import QMessageBox
from Localizations import ERROR_MESSAGE, ERROR, TASK_ERROR from Localizations import *
def show_error(logger, message) -> None: def show_error(logger, message):
logger.error(message) logger.error(ERROR_MESSAGE.format(message))
QMessageBox.critical(None, ERROR, message) QMessageBox.critical(None, ERROR, message)
def handle_error(logger, error_message, task_item) -> None: def handle_error(logger, error_message, task_item):
logger.error(TASK_ERROR.format(error_message)) logger.error(TASK_ERROR.format(error_message))
show_error(logger, error_message) show_error(logger, error_message)
task_item.update_status(ERROR) task_item.update_status(ERROR)

File diff suppressed because it is too large

View File

@ -1,3 +1,7 @@
# This file left for compatibility. If you want to use the GGUF API from Python
# then don't import gguf/gguf.py directly. If you're looking for examples, see the
# examples/ directory for gguf-py
import importlib import importlib
import sys import sys
from pathlib import Path from pathlib import Path

View File

@ -1,8 +1,11 @@
#
# GGUF file reading/modification support. For API usage information,
# please see the files scripts/ for some fairly simple examples.
#
from __future__ import annotations from __future__ import annotations
import logging import logging
import os import os
import sys
from collections import OrderedDict from collections import OrderedDict
from typing import Any, Literal, NamedTuple, TypeVar, Union from typing import Any, Literal, NamedTuple, TypeVar, Union
@ -12,6 +15,7 @@
from .quants import quant_shape_to_byte_shape from .quants import quant_shape_to_byte_shape
if __name__ == "__main__": if __name__ == "__main__":
import sys
from pathlib import Path from pathlib import Path
# Allow running file in package as a script. # Allow running file in package as a script.
@ -24,7 +28,6 @@
GGUF_VERSION, GGUF_VERSION,
GGMLQuantizationType, GGMLQuantizationType,
GGUFValueType, GGUFValueType,
GGUFEndian,
) )
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -50,52 +53,6 @@ class ReaderField(NamedTuple):
types: list[GGUFValueType] = [] types: list[GGUFValueType] = []
def contents(self, index_or_slice: int | slice = slice(None)) -> Any:
if self.types:
to_string = lambda x: str(x.tobytes(), encoding="utf-8") # noqa: E731
main_type = self.types[0]
if main_type == GGUFValueType.ARRAY:
sub_type = self.types[-1]
if sub_type == GGUFValueType.STRING:
indices = self.data[index_or_slice]
if isinstance(index_or_slice, int):
return to_string(self.parts[indices]) # type: ignore
else:
return [to_string(self.parts[idx]) for idx in indices] # type: ignore
else:
# FIXME: When/if _get_field_parts() support multi-dimensional arrays, this must do so too
# Check if it's unsafe to perform slice optimization on data
# if any(True for idx in self.data if len(self.parts[idx]) != 1):
# optim_slice = slice(None)
# else:
# optim_slice = index_or_slice
# index_or_slice = slice(None)
# if isinstance(optim_slice, int):
# return self.parts[self.data[optim_slice]].tolist()[0]
# else:
# return [pv for idx in self.data[optim_slice] for pv in self.parts[idx].tolist()][index_or_slice]
if isinstance(index_or_slice, int):
return self.parts[self.data[index_or_slice]].tolist()[0]
else:
return [
pv
for idx in self.data[index_or_slice]
for pv in self.parts[idx].tolist()
]
if main_type == GGUFValueType.STRING:
return to_string(self.parts[-1])
else:
return self.parts[-1].tolist()[0]
return None
class ReaderTensor(NamedTuple): class ReaderTensor(NamedTuple):
name: str name: str
@ -146,23 +103,12 @@ def __init__(
# If we get 0 here that means it's (probably) a GGUF file created for # If we get 0 here that means it's (probably) a GGUF file created for
# the opposite byte order of the machine this script is running on. # the opposite byte order of the machine this script is running on.
self.byte_order = "S" self.byte_order = "S"
temp_version = temp_version.view( temp_version = temp_version.newbyteorder(self.byte_order)
temp_version.dtype.newbyteorder(self.byte_order)
)
version = temp_version[0] version = temp_version[0]
if version not in READER_SUPPORTED_VERSIONS: if version not in READER_SUPPORTED_VERSIONS:
raise ValueError( raise ValueError(
f"Sorry, file appears to be version {version} which we cannot handle" f"Sorry, file appears to be version {version} which we cannot handle"
) )
if sys.byteorder == "little":
# Host is little endian
host_endian = GGUFEndian.LITTLE
swapped_endian = GGUFEndian.BIG
else:
# Sorry PDP or other weird systems that don't use BE or LE.
host_endian = GGUFEndian.BIG
swapped_endian = GGUFEndian.LITTLE
self.endianess = swapped_endian if self.byte_order == "S" else host_endian
self.fields: OrderedDict[str, ReaderField] = OrderedDict() self.fields: OrderedDict[str, ReaderField] = OrderedDict()
self.tensors: list[ReaderTensor] = [] self.tensors: list[ReaderTensor] = []
offs += self._push_field( offs += self._push_field(
@ -223,11 +169,10 @@ def _get(
count = int(count) count = int(count)
itemsize = int(np.empty([], dtype=dtype).itemsize) itemsize = int(np.empty([], dtype=dtype).itemsize)
end_offs = offset + itemsize * count end_offs = offset + itemsize * count
# main:
arr = self.data[offset:end_offs].view(dtype=dtype)[:count]
return arr.view(
    arr.dtype.newbyteorder(
        self.byte_order if override_order is None else override_order
    )
)
# v1.7.0:
return (
    self.data[offset:end_offs]
    .view(dtype=dtype)[:count]
    .newbyteorder(override_order or self.byte_order)
)
def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int: def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
@ -274,7 +219,6 @@ def _get_field_parts(
offs += int(alen.nbytes) offs += int(alen.nbytes)
aparts: list[npt.NDArray[Any]] = [raw_itype, alen] aparts: list[npt.NDArray[Any]] = [raw_itype, alen]
data_idxs: list[int] = [] data_idxs: list[int] = []
# FIXME: Handle multi-dimensional arrays properly instead of flattening
for idx in range(alen[0]): for idx in range(alen[0]):
curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts( curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts(
offs, raw_itype[0] offs, raw_itype[0]
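
The removed contents() helper above decodes a field's raw numpy parts into Python values, treating GGUF strings as length-prefixed UTF-8 bytes. A minimal stand-alone sketch of that decoding step follows; the parts layout is an assumption that mirrors the reader's internal representation, not the GGUFReader API itself.

import numpy as np

# Hypothetical stand-in for a ReaderField's raw parts: a uint64 length
# followed by the raw UTF-8 bytes, each kept as a numpy array.
raw = "llama".encode("utf-8")
parts = [np.asarray([len(raw)], dtype=np.uint64), np.frombuffer(raw, dtype=np.uint8)]

def decode_string_part(part: np.ndarray) -> str:
    # Same trick as the removed contents() helper: reinterpret the bytes, then decode.
    return str(part.tobytes(), encoding="utf-8")

print(decode_string_part(parts[-1]))  # -> "llama"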

View File

@ -26,7 +26,6 @@
RopeScalingType, RopeScalingType,
PoolingType, PoolingType,
TokenType, TokenType,
ExpertGatingFuncType,
) )
from .quants import quant_shape_from_byte_shape from .quants import quant_shape_from_byte_shape
@ -642,11 +641,6 @@ def add_base_model_organization(self, source_id: int, organization: str) -> None
Keys.General.BASE_MODEL_ORGANIZATION.format(id=source_id), organization Keys.General.BASE_MODEL_ORGANIZATION.format(id=source_id), organization
) )
def add_base_model_description(self, source_id: int, description: str) -> None:
self.add_string(
Keys.General.BASE_MODEL_DESCRIPTION.format(id=source_id), description
)
def add_base_model_url(self, source_id: int, url: str) -> None: def add_base_model_url(self, source_id: int, url: str) -> None:
self.add_string(Keys.General.BASE_MODEL_URL.format(id=source_id), url) self.add_string(Keys.General.BASE_MODEL_URL.format(id=source_id), url)
@ -659,46 +653,15 @@ def add_base_model_uuid(self, source_id: int, uuid: str) -> None:
def add_base_model_repo_url(self, source_id: int, repo_url: str) -> None: def add_base_model_repo_url(self, source_id: int, repo_url: str) -> None:
self.add_string(Keys.General.BASE_MODEL_REPO_URL.format(id=source_id), repo_url) self.add_string(Keys.General.BASE_MODEL_REPO_URL.format(id=source_id), repo_url)
def add_dataset_count(self, source_count: int) -> None:
self.add_uint32(Keys.General.DATASET_COUNT, source_count)
def add_dataset_name(self, source_id: int, name: str) -> None:
self.add_string(Keys.General.DATASET_NAME.format(id=source_id), name)
def add_dataset_author(self, source_id: int, author: str) -> None:
self.add_string(Keys.General.DATASET_AUTHOR.format(id=source_id), author)
def add_dataset_version(self, source_id: int, version: str) -> None:
self.add_string(Keys.General.DATASET_VERSION.format(id=source_id), version)
def add_dataset_organization(self, source_id: int, organization: str) -> None:
self.add_string(
Keys.General.DATASET_ORGANIZATION.format(id=source_id), organization
)
def add_dataset_description(self, source_id: int, description: str) -> None:
self.add_string(
Keys.General.DATASET_DESCRIPTION.format(id=source_id), description
)
def add_dataset_url(self, source_id: int, url: str) -> None:
self.add_string(Keys.General.DATASET_URL.format(id=source_id), url)
def add_dataset_doi(self, source_id: int, doi: str) -> None:
self.add_string(Keys.General.DATASET_DOI.format(id=source_id), doi)
def add_dataset_uuid(self, source_id: int, uuid: str) -> None:
self.add_string(Keys.General.DATASET_UUID.format(id=source_id), uuid)
def add_dataset_repo_url(self, source_id: int, repo_url: str) -> None:
self.add_string(Keys.General.DATASET_REPO_URL.format(id=source_id), repo_url)
def add_tags(self, tags: Sequence[str]) -> None: def add_tags(self, tags: Sequence[str]) -> None:
self.add_array(Keys.General.TAGS, tags) self.add_array(Keys.General.TAGS, tags)
def add_languages(self, languages: Sequence[str]) -> None: def add_languages(self, languages: Sequence[str]) -> None:
self.add_array(Keys.General.LANGUAGES, languages) self.add_array(Keys.General.LANGUAGES, languages)
def add_datasets(self, datasets: Sequence[str]) -> None:
self.add_array(Keys.General.DATASETS, datasets)
def add_tensor_data_layout(self, layout: str) -> None: def add_tensor_data_layout(self, layout: str) -> None:
self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout) self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
@ -711,21 +674,6 @@ def add_context_length(self, length: int) -> None:
def add_embedding_length(self, length: int) -> None: def add_embedding_length(self, length: int) -> None:
self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length) self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length)
def add_features_length(self, length: int) -> None:
self.add_uint32(Keys.LLM.FEATURES_LENGTH.format(arch=self.arch), length)
def add_posnet_embedding_length(self, length: int) -> None:
self.add_uint32(Keys.PosNet.EMBEDDING_LENGTH.format(arch=self.arch), length)
def add_posnet_block_count(self, length: int) -> None:
self.add_uint32(Keys.PosNet.BLOCK_COUNT.format(arch=self.arch), length)
def add_convnext_embedding_length(self, length: int) -> None:
self.add_uint32(Keys.ConvNext.EMBEDDING_LENGTH.format(arch=self.arch), length)
def add_convnext_block_count(self, length: int) -> None:
self.add_uint32(Keys.ConvNext.BLOCK_COUNT.format(arch=self.arch), length)
def add_block_count(self, length: int) -> None: def add_block_count(self, length: int) -> None:
self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length) self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)
@ -774,12 +722,6 @@ def add_key_length(self, length: int) -> None:
def add_value_length(self, length: int) -> None: def add_value_length(self, length: int) -> None:
self.add_uint32(Keys.Attention.VALUE_LENGTH.format(arch=self.arch), length) self.add_uint32(Keys.Attention.VALUE_LENGTH.format(arch=self.arch), length)
def add_key_length_mla(self, length: int) -> None:
self.add_uint32(Keys.Attention.KEY_LENGTH_MLA.format(arch=self.arch), length)
def add_value_length_mla(self, length: int) -> None:
self.add_uint32(Keys.Attention.VALUE_LENGTH_MLA.format(arch=self.arch), length)
def add_max_alibi_bias(self, bias: float) -> None: def add_max_alibi_bias(self, bias: float) -> None:
self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias) self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias)
@ -807,56 +749,12 @@ def add_expert_shared_count(self, count: int) -> None:
def add_expert_weights_scale(self, value: float) -> None: def add_expert_weights_scale(self, value: float) -> None:
self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value) self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)
def add_expert_weights_norm(self, value: bool) -> None:
self.add_bool(Keys.LLM.EXPERT_WEIGHTS_NORM.format(arch=self.arch), value)
def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None:
self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value)
def add_moe_every_n_layers(self, value: int) -> None:
self.add_uint32(Keys.LLM.MOE_EVERY_N_LAYERS.format(arch=self.arch), value)
def add_swin_norm(self, value: bool) -> None:
self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)
def add_rescale_every_n_layers(self, count: int) -> None:
self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count)
def add_time_mix_extra_dim(self, dim: int) -> None:
self.add_uint32(Keys.LLM.TIME_MIX_EXTRA_DIM.format(arch=self.arch), dim)
def add_time_decay_extra_dim(self, dim: int) -> None:
self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim)
def add_residual_scale(self, value: float) -> None:
self.add_float32(Keys.LLM.RESIDUAL_SCALE.format(arch=self.arch), value)
def add_embedding_scale(self, value: float) -> None:
self.add_float32(Keys.LLM.EMBEDDING_SCALE.format(arch=self.arch), value)
def add_wkv_head_size(self, size: int) -> None:
self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size)
def add_token_shift_count(self, count: int) -> None:
self.add_uint32(Keys.LLM.TOKEN_SHIFT_COUNT.format(arch=self.arch), count)
def add_interleave_moe_layer_step(self, value: int) -> None:
self.add_uint32(
Keys.LLM.INTERLEAVE_MOE_LAYER_STEP.format(arch=self.arch), value
)
def add_layer_norm_eps(self, value: float) -> None: def add_layer_norm_eps(self, value: float) -> None:
self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value) self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
def add_layer_norm_rms_eps(self, value: float) -> None: def add_layer_norm_rms_eps(self, value: float) -> None:
self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value) self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value)
def add_group_norm_eps(self, value: float) -> None:
self.add_float32(Keys.Attention.GROUPNORM_EPS.format(arch=self.arch), value)
def add_group_norm_groups(self, value: int) -> None:
self.add_uint32(Keys.Attention.GROUPNORM_GROUPS.format(arch=self.arch), value)
def add_causal_attention(self, value: bool) -> None: def add_causal_attention(self, value: bool) -> None:
self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value) self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
@ -866,38 +764,18 @@ def add_q_lora_rank(self, length: int) -> None:
def add_kv_lora_rank(self, length: int) -> None: def add_kv_lora_rank(self, length: int) -> None:
self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length) self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length)
def add_decay_lora_rank(self, length: int) -> None:
self.add_uint32(Keys.Attention.DECAY_LORA_RANK.format(arch=self.arch), length)
def add_iclr_lora_rank(self, length: int) -> None:
self.add_uint32(Keys.Attention.ICLR_LORA_RANK.format(arch=self.arch), length)
def add_value_residual_mix_lora_rank(self, length: int) -> None:
self.add_uint32(
Keys.Attention.VALUE_RESIDUAL_MIX_LORA_RANK.format(arch=self.arch), length
)
def add_gate_lora_rank(self, length: int) -> None:
self.add_uint32(Keys.Attention.GATE_LORA_RANK.format(arch=self.arch), length)
def add_relative_attn_buckets_count(self, value: int) -> None: def add_relative_attn_buckets_count(self, value: int) -> None:
self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value) self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value)
def add_sliding_window(self, value: int) -> None: def add_sliding_window(self, value: int) -> None:
self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value) self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value)
def add_attention_scale(self, value: float) -> None:
self.add_float32(Keys.Attention.SCALE.format(arch=self.arch), value)
def add_pooling_type(self, value: PoolingType) -> None: def add_pooling_type(self, value: PoolingType) -> None:
self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value) self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
def add_rope_dimension_count(self, count: int) -> None: def add_rope_dimension_count(self, count: int) -> None:
self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
def add_rope_dimension_sections(self, dims: Sequence[int]) -> None:
self.add_array(Keys.Rope.DIMENSION_SECTIONS.format(arch=self.arch), dims)
def add_rope_freq_base(self, value: float) -> None: def add_rope_freq_base(self, value: float) -> None:
self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value) self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value)
@ -931,9 +809,6 @@ def add_ssm_state_size(self, value: int) -> None:
def add_ssm_time_step_rank(self, value: int) -> None: def add_ssm_time_step_rank(self, value: int) -> None:
self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value) self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value)
def add_ssm_dt_b_c_rms(self, value: bool) -> None:
self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value)
def add_tokenizer_model(self, model: str) -> None: def add_tokenizer_model(self, model: str) -> None:
self.add_string(Keys.Tokenizer.MODEL, model) self.add_string(Keys.Tokenizer.MODEL, model)
@ -974,6 +849,9 @@ def add_sep_token_id(self, id: int) -> None:
def add_pad_token_id(self, id: int) -> None: def add_pad_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.PAD_ID, id) self.add_uint32(Keys.Tokenizer.PAD_ID, id)
def add_cls_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.CLS_ID, id)
def add_mask_token_id(self, id: int) -> None: def add_mask_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.MASK_ID, id) self.add_uint32(Keys.Tokenizer.MASK_ID, id)
@ -1025,65 +903,18 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
def add_prefix_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.PREFIX_ID, id)
def add_suffix_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id)
def add_middle_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id)
def add_eot_token_id(self, id: int) -> None: def add_eot_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.EOT_ID, id) self.add_uint32(Keys.Tokenizer.EOT_ID, id)
def add_eom_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.EOM_ID, id)
# for vision models
def add_vision_projection_dim(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.PROJECTION_DIM, value)
def add_vision_has_vision_encoder(self, value: bool) -> None:
self.add_bool(Keys.ClipVision.HAS_VISION_ENCODER, value)
def add_vision_patch_size(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.PATCH_SIZE, value)
def add_vision_embedding_length(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.EMBEDDING_LENGTH, value)
def add_vision_feed_forward_length(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.FEED_FORWARD_LENGTH, value)
def add_vision_block_count(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.BLOCK_COUNT, value)
def add_vision_head_count(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.Attention.HEAD_COUNT, value)
def add_vision_projector_type(self, value: str) -> None:
self.add_string(Keys.ClipVision.PROJECTOR_TYPE, value)
def add_vision_attention_layernorm_eps(self, value: float) -> None:
self.add_float32(Keys.ClipVision.Attention.LAYERNORM_EPS, value)
def add_vision_image_size(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value)
def add_vision_image_mean(self, values: Sequence[float]) -> None:
self.add_array(Keys.ClipVision.IMAGE_MEAN, values)
def add_vision_image_std(self, values: Sequence[float]) -> None:
self.add_array(Keys.ClipVision.IMAGE_STD, values)
def add_vision_spatial_merge_size(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.SPATIAL_MERGE_SIZE, value)
def add_vision_use_gelu(self, value: bool) -> None:
self.add_bool(Keys.ClipVision.USE_GELU, value)
def add_vision_use_silu(self, value: bool) -> None:
self.add_bool(Keys.ClipVision.USE_SILU, value)
def add_vision_projector_scale_factor(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.Projector.SCALE_FACTOR, value)
def add_vision_n_wa_pattern(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value)
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes: def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
pack_prefix = "" pack_prefix = ""
if not skip_pack_prefix: if not skip_pack_prefix:
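
The add_dataset_* and vision-related setters removed above all follow the same pattern: format an id- or arch-scoped key and delegate to a typed writer call. A toy sketch of that pattern; the key templates and ToyWriter class are illustrative assumptions, not the constants from gguf.constants.

class ToyWriter:
    def __init__(self) -> None:
        self.kv: dict[str, object] = {}

    def add_string(self, key: str, value: str) -> None:
        self.kv[key] = value

    def add_uint32(self, key: str, value: int) -> None:
        self.kv[key] = value

    # Wrapper pattern: scope the key by source id, then write a typed value.
    def add_dataset_count(self, count: int) -> None:
        self.add_uint32("general.dataset.count", count)

    def add_dataset_name(self, source_id: int, name: str) -> None:
        self.add_string("general.dataset.{id}.name".format(id=source_id), name)

w = ToyWriter()
w.add_dataset_count(1)
w.add_dataset_name(0, "example-dataset")
print(w.kv)  # {'general.dataset.count': 1, 'general.dataset.0.name': 'example-dataset'}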

View File

@ -12,7 +12,6 @@
class LazyMeta(ABCMeta): class LazyMeta(ABCMeta):
def __new__( def __new__(
cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs
): ):
@ -201,27 +200,6 @@ def wrapped_fn(*args, **kwargs):
return cls( return cls(
meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn
) )
elif isinstance(res, tuple) and all(
isinstance(t, cls._tensor_type) for t in res
):
# share the evaluation between lazy tuple elements
shared_args: list = [args, None]
def eager_tuple_element(a: list[Any], i: int = 0, /, **kw) -> LazyBase:
assert len(a) == 2
if a[1] is None:
a[1] = fn(*a[0], **kw)
return a[1][i]
return tuple(
cls(
meta=cls.eager_to_meta(res[i]),
args=(shared_args, i),
kwargs=kwargs,
func=eager_tuple_element,
)
for i in range(len(res))
)
else: else:
del res # not needed del res # not needed
# non-tensor return likely relies on the contents of the args # non-tensor return likely relies on the contents of the args
@ -276,8 +254,6 @@ def from_eager(cls, t: Any) -> Any:
class LazyNumpyTensor(LazyBase): class LazyNumpyTensor(LazyBase):
_tensor_type = np.ndarray _tensor_type = np.ndarray
shape: tuple[int, ...] # Makes the type checker happy in quants.py
@classmethod @classmethod
def meta_with_dtype_and_shape( def meta_with_dtype_and_shape(
cls, dtype: DTypeLike, shape: tuple[int, ...] cls, dtype: DTypeLike, shape: tuple[int, ...]
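
The tuple branch removed from the lazy wrapper above shares one evaluation across all lazy tuple elements by caching the result in a two-slot list. A self-contained sketch of that mechanism, independent of the LazyBase machinery (make_lazy_tuple and divmod_logged are illustrative names):

from typing import Any, Callable

def make_lazy_tuple(fn: Callable[..., tuple], *args: Any, n: int) -> tuple:
    shared: list = [args, None]  # [original args, cached result]

    def element(i: int) -> Callable[[], Any]:
        def thunk() -> Any:
            if shared[1] is None:  # evaluate fn at most once
                shared[1] = fn(*shared[0])
            return shared[1][i]
        return thunk

    return tuple(element(i) for i in range(n))

calls = []
def divmod_logged(a: int, b: int) -> tuple:
    calls.append((a, b))
    return divmod(a, b)

q, r = make_lazy_tuple(divmod_logged, 7, 3, n=2)
print(q(), r(), len(calls))  # -> 2 1 1: both elements share a single evaluation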

View File

@ -41,7 +41,7 @@ class Metadata:
base_models: Optional[list[dict]] = None base_models: Optional[list[dict]] = None
tags: Optional[list[str]] = None tags: Optional[list[str]] = None
languages: Optional[list[str]] = None languages: Optional[list[str]] = None
datasets: Optional[list[dict]] = None datasets: Optional[list[str]] = None
@staticmethod @staticmethod
def load( def load(
@ -126,13 +126,13 @@ def load(
"general.base_models", metadata.base_models "general.base_models", metadata.base_models
) )
# Datasets is received here as an array of datasets
metadata.datasets = metadata_override.get("general.datasets", metadata.datasets)
metadata.tags = metadata_override.get(Keys.General.TAGS, metadata.tags) metadata.tags = metadata_override.get(Keys.General.TAGS, metadata.tags)
metadata.languages = metadata_override.get( metadata.languages = metadata_override.get(
Keys.General.LANGUAGES, metadata.languages Keys.General.LANGUAGES, metadata.languages
) )
metadata.datasets = metadata_override.get(
Keys.General.DATASETS, metadata.datasets
)
# Direct Metadata Override (via direct cli argument) # Direct Metadata Override (via direct cli argument)
if model_name is not None: if model_name is not None:
@ -160,32 +160,12 @@ def load_model_card(model_path: Optional[Path] = None) -> dict[str, Any]:
if not model_card_path.is_file(): if not model_card_path.is_file():
return {} return {}
# The model card metadata is assumed to always be in YAML (frontmatter) # The model card metadata is assumed to always be in YAML
# ref: https://github.com/huggingface/transformers/blob/a5c642fe7a1f25d3bdcd76991443ba6ff7ee34b2/src/transformers/modelcard.py#L468-L473 # ref: https://github.com/huggingface/transformers/blob/a5c642fe7a1f25d3bdcd76991443ba6ff7ee34b2/src/transformers/modelcard.py#L468-L473
yaml_content: str = ""
with open(model_card_path, "r", encoding="utf-8") as f: with open(model_card_path, "r", encoding="utf-8") as f:
content = f.read() if f.readline() == "---\n":
lines = content.splitlines() raw = f.read().partition("---\n")[0]
lines_yaml = [] data = yaml.safe_load(raw)
if len(lines) == 0:
# Empty file
return {}
if len(lines) > 0 and lines[0] != "---":
# No frontmatter
return {}
for line in lines[1:]:
if line == "---":
break # End of frontmatter
else:
lines_yaml.append(line)
yaml_content = "\n".join(lines_yaml) + "\n"
# Quick hack to fix the Norway problem
# https://hitchdev.com/strictyaml/why/implicit-typing-removed/
yaml_content = yaml_content.replace("- no\n", '- "no"\n')
if yaml_content:
data = yaml.safe_load(yaml_content)
if isinstance(data, dict): if isinstance(data, dict):
return data return data
else: else:
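
The frontmatter parsing added above (left column) reads the whole model card, keeps only the lines between the leading "---" markers, and quotes bare "no" entries before handing the block to yaml.safe_load. A condensed sketch of that logic operating on a string instead of a file path (read_frontmatter is a made-up name):

import yaml  # PyYAML

def read_frontmatter(text: str) -> dict:
    lines = text.splitlines()
    if not lines or lines[0] != "---":
        return {}  # empty file or no frontmatter
    block = []
    for line in lines[1:]:
        if line == "---":
            break  # end of frontmatter
        block.append(line)
    raw = "\n".join(block) + "\n"
    # Norway problem workaround: keep "- no" as the string "no", not boolean False
    raw = raw.replace("- no\n", '- "no"\n')
    data = yaml.safe_load(raw)
    return data if isinstance(data, dict) else {}

print(read_frontmatter("---\nlanguage:\n- en\n- no\n---\n# Model card"))
# -> {'language': ['en', 'no']}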
@ -248,11 +228,7 @@ def get_model_id_components(
org_component, model_full_name_component = None, model_id org_component, model_full_name_component = None, model_id
# Check if we erroneously matched against './' or '../' etc... # Check if we erroneously matched against './' or '../' etc...
if ( if org_component is not None and org_component[0] == ".":
org_component is not None
and len(org_component) > 0
and org_component[0] == "."
):
org_component = None org_component = None
name_parts: list[str] = model_full_name_component.split("-") name_parts: list[str] = model_full_name_component.split("-")
@ -411,86 +387,27 @@ def apply_metadata_heuristic(
######################## ########################
if model_card is not None: if model_card is not None:
def use_model_card_metadata(metadata_key: str, model_card_key: str): if "model_name" in model_card and metadata.name is None:
if (
model_card_key in model_card
and getattr(metadata, metadata_key, None) is None
):
setattr(metadata, metadata_key, model_card.get(model_card_key))
def use_array_model_card_metadata(metadata_key: str, model_card_key: str):
# Note: Will append rather than replace if already exist
tags_value = model_card.get(model_card_key, None)
if tags_value is None:
return
current_value = getattr(metadata, metadata_key, None)
if current_value is None:
current_value = []
if isinstance(tags_value, str):
current_value.append(tags_value)
elif isinstance(tags_value, list):
current_value.extend(tags_value)
setattr(metadata, metadata_key, current_value)
# LLAMA.cpp's direct internal convention
# (Definitely not part of hugging face formal/informal standard)
#########################################
use_model_card_metadata("name", "name")
use_model_card_metadata("author", "author")
use_model_card_metadata("version", "version")
use_model_card_metadata("organization", "organization")
use_model_card_metadata("description", "description")
use_model_card_metadata("finetune", "finetune")
use_model_card_metadata("basename", "basename")
use_model_card_metadata("size_label", "size_label")
use_model_card_metadata("source_url", "url")
use_model_card_metadata("source_doi", "doi")
use_model_card_metadata("source_uuid", "uuid")
use_model_card_metadata("source_repo_url", "repo_url")
# LLAMA.cpp's huggingface style convention
# (Definitely not part of hugging face formal/informal standard... but with model_ appended to match their style)
###########################################
use_model_card_metadata("name", "model_name")
use_model_card_metadata("author", "model_author")
use_model_card_metadata("version", "model_version")
use_model_card_metadata("organization", "model_organization")
use_model_card_metadata("description", "model_description")
use_model_card_metadata("finetune", "model_finetune")
use_model_card_metadata("basename", "model_basename")
use_model_card_metadata("size_label", "model_size_label")
use_model_card_metadata("source_url", "model_url")
use_model_card_metadata("source_doi", "model_doi")
use_model_card_metadata("source_uuid", "model_uuid")
use_model_card_metadata("source_repo_url", "model_repo_url")
# Hugging Face Direct Convention
#################################
# Not part of huggingface model card standard but notice some model creator using it # Not part of huggingface model card standard but notice some model creator using it
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF' # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
use_model_card_metadata("name", "model_name") metadata.name = model_card.get("model_name")
use_model_card_metadata("author", "model_creator")
use_model_card_metadata("basename", "model_type")
if ( if "model_creator" in model_card and metadata.author is None:
"base_model" in model_card # Not part of huggingface model card standard but notice some model creator using it
or "base_models" in model_card # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
or "base_model_sources" in model_card metadata.author = model_card.get("model_creator")
):
if "model_type" in model_card and metadata.basename is None:
# Not part of huggingface model card standard but notice some model creator using it
# such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
metadata.basename = model_card.get("model_type")
if "base_model" in model_card:
# This represents the parent models that this is based on # This represents the parent models that this is based on
# Example: stabilityai/stable-diffusion-xl-base-1.0. Can also be a list (for merges) # Example: stabilityai/stable-diffusion-xl-base-1.0. Can also be a list (for merges)
# Example of merges: https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0.1/blob/main/README.md # Example of merges: https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0.1/blob/main/README.md
metadata_base_models = [] metadata_base_models = []
base_model_value = model_card.get( base_model_value = model_card.get("base_model", None)
"base_model",
model_card.get(
"base_models", model_card.get("base_model_sources", None)
),
)
if base_model_value is not None: if base_model_value is not None:
if isinstance(base_model_value, str): if isinstance(base_model_value, str):
@ -503,47 +420,6 @@ def use_array_model_card_metadata(metadata_key: str, model_card_key: str):
for model_id in metadata_base_models: for model_id in metadata_base_models:
# NOTE: model size of base model is assumed to be similar to the size of the current model # NOTE: model size of base model is assumed to be similar to the size of the current model
base_model = {}
if isinstance(model_id, str):
if (
model_id.startswith("http://")
or model_id.startswith("https://")
or model_id.startswith("ssh://")
):
base_model["repo_url"] = model_id
# Check if Hugging Face ID is present in URL
if "huggingface.co" in model_id:
match = re.match(
r"https?://huggingface.co/([^/]+/[^/]+)$", model_id
)
if match:
model_id_component = match.group(1)
(
model_full_name_component,
org_component,
basename,
finetune,
version,
size_label,
) = Metadata.get_model_id_components(
model_id_component, total_params
)
# Populate model dictionary with extracted components
if model_full_name_component is not None:
base_model["name"] = Metadata.id_to_title(
model_full_name_component
)
if org_component is not None:
base_model["organization"] = (
Metadata.id_to_title(org_component)
)
if version is not None:
base_model["version"] = version
else:
# Likely a Hugging Face ID
( (
model_full_name_component, model_full_name_component,
org_component, org_component,
@ -552,146 +428,78 @@ def use_array_model_card_metadata(metadata_key: str, model_card_key: str):
version, version,
size_label, size_label,
) = Metadata.get_model_id_components(model_id, total_params) ) = Metadata.get_model_id_components(model_id, total_params)
base_model = {}
# Populate model dictionary with extracted components
if model_full_name_component is not None: if model_full_name_component is not None:
base_model["name"] = Metadata.id_to_title( base_model["name"] = Metadata.id_to_title(
model_full_name_component model_full_name_component
) )
if org_component is not None: if org_component is not None:
base_model["organization"] = Metadata.id_to_title( base_model["organization"] = Metadata.id_to_title(org_component)
org_component
)
if version is not None: if version is not None:
base_model["version"] = version base_model["version"] = version
if ( if (
org_component is not None org_component is not None
and model_full_name_component is not None and model_full_name_component is not None
): ):
# main:
base_model["repo_url"] = (
    f"https://huggingface.co/{org_component}/{model_full_name_component}"
)
# v1.7.0:
base_model["repo_url"] = f"https://huggingface.co/{org_component}/{model_full_name_component}"
elif isinstance(model_id, dict):
base_model = model_id
else:
logger.error(
f"base model entry '{str(model_id)}' not in a known format"
)
metadata.base_models.append(base_model) metadata.base_models.append(base_model)
if ( if "license" in model_card and metadata.license is None:
"datasets" in model_card metadata.license = model_card.get("license")
or "dataset" in model_card
or "dataset_sources" in model_card
):
# This represents the datasets that this was trained from
metadata_datasets = []
dataset_value = model_card.get(
"datasets",
model_card.get("dataset", model_card.get("dataset_sources", None)),
)
if "license_name" in model_card and metadata.license_name is None:
metadata.license_name = model_card.get("license_name")
if "license_link" in model_card and metadata.license_link is None:
metadata.license_link = model_card.get("license_link")
tags_value = model_card.get("tags", None)
if tags_value is not None:
if metadata.tags is None:
metadata.tags = []
if isinstance(tags_value, str):
metadata.tags.append(tags_value)
elif isinstance(tags_value, list):
metadata.tags.extend(tags_value)
pipeline_tags_value = model_card.get("pipeline_tag", None)
if pipeline_tags_value is not None:
if metadata.tags is None:
metadata.tags = []
if isinstance(pipeline_tags_value, str):
metadata.tags.append(pipeline_tags_value)
elif isinstance(pipeline_tags_value, list):
metadata.tags.extend(pipeline_tags_value)
language_value = model_card.get(
"languages", model_card.get("language", None)
)
if language_value is not None:
if metadata.languages is None:
metadata.languages = []
if isinstance(language_value, str):
metadata.languages.append(language_value)
elif isinstance(language_value, list):
metadata.languages.extend(language_value)
dataset_value = model_card.get("datasets", model_card.get("dataset", None))
if dataset_value is not None: if dataset_value is not None:
if isinstance(dataset_value, str):
metadata_datasets.append(dataset_value)
elif isinstance(dataset_value, list):
metadata_datasets.extend(dataset_value)
if metadata.datasets is None: if metadata.datasets is None:
metadata.datasets = [] metadata.datasets = []
for dataset_id in metadata_datasets: if isinstance(dataset_value, str):
# NOTE: model size of base model is assumed to be similar to the size of the current model metadata.datasets.append(dataset_value)
dataset = {} elif isinstance(dataset_value, list):
if isinstance(dataset_id, str): metadata.datasets.extend(dataset_value)
if dataset_id.startswith(("http://", "https://", "ssh://")):
dataset["repo_url"] = dataset_id
# Check if Hugging Face ID is present in URL
if "huggingface.co" in dataset_id:
match = re.match(
r"https?://huggingface.co/([^/]+/[^/]+)$",
dataset_id,
)
if match:
dataset_id_component = match.group(1)
(
dataset_name_component,
org_component,
basename,
finetune,
version,
size_label,
) = Metadata.get_model_id_components(
dataset_id_component, total_params
)
# Populate dataset dictionary with extracted components
if dataset_name_component is not None:
dataset["name"] = Metadata.id_to_title(
dataset_name_component
)
if org_component is not None:
dataset["organization"] = Metadata.id_to_title(
org_component
)
if version is not None:
dataset["version"] = version
else:
# Likely a Hugging Face ID
(
dataset_name_component,
org_component,
basename,
finetune,
version,
size_label,
) = Metadata.get_model_id_components(
dataset_id, total_params
)
# Populate dataset dictionary with extracted components
if dataset_name_component is not None:
dataset["name"] = Metadata.id_to_title(
dataset_name_component
)
if org_component is not None:
dataset["organization"] = Metadata.id_to_title(
org_component
)
if version is not None:
dataset["version"] = version
if (
org_component is not None
and dataset_name_component is not None
):
dataset["repo_url"] = (
f"https://huggingface.co/{org_component}/{dataset_name_component}"
)
elif isinstance(dataset_id, dict):
dataset = dataset_id
else:
logger.error(
f"dataset entry '{str(dataset_id)}' not in a known format"
)
metadata.datasets.append(dataset)
use_model_card_metadata("license", "license")
use_model_card_metadata("license_name", "license_name")
use_model_card_metadata("license_link", "license_link")
use_array_model_card_metadata("tags", "tags")
use_array_model_card_metadata("tags", "pipeline_tag")
use_array_model_card_metadata("languages", "languages")
use_array_model_card_metadata("languages", "language")
# Hugging Face Parameter Heuristics # Hugging Face Parameter Heuristics
#################################### ####################################
@ -776,9 +584,6 @@ def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter):
gguf_writer.add_size_label(self.size_label) gguf_writer.add_size_label(self.size_label)
if self.license is not None: if self.license is not None:
if isinstance(self.license, list):
gguf_writer.add_license(",".join(self.license))
else:
gguf_writer.add_license(self.license) gguf_writer.add_license(self.license)
if self.license_name is not None: if self.license_name is not None:
gguf_writer.add_license_name(self.license_name) gguf_writer.add_license_name(self.license_name)
@ -816,10 +621,6 @@ def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter):
gguf_writer.add_base_model_organization( gguf_writer.add_base_model_organization(
key, base_model_entry["organization"] key, base_model_entry["organization"]
) )
if "description" in base_model_entry:
gguf_writer.add_base_model_description(
key, base_model_entry["description"]
)
if "url" in base_model_entry: if "url" in base_model_entry:
gguf_writer.add_base_model_url(key, base_model_entry["url"]) gguf_writer.add_base_model_url(key, base_model_entry["url"])
if "doi" in base_model_entry: if "doi" in base_model_entry:
@ -831,33 +632,9 @@ def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter):
key, base_model_entry["repo_url"] key, base_model_entry["repo_url"]
) )
if self.datasets is not None:
gguf_writer.add_dataset_count(len(self.datasets))
for key, dataset_entry in enumerate(self.datasets):
if "name" in dataset_entry:
gguf_writer.add_dataset_name(key, dataset_entry["name"])
if "author" in dataset_entry:
gguf_writer.add_dataset_author(key, dataset_entry["author"])
if "version" in dataset_entry:
gguf_writer.add_dataset_version(key, dataset_entry["version"])
if "organization" in dataset_entry:
gguf_writer.add_dataset_organization(
key, dataset_entry["organization"]
)
if "description" in dataset_entry:
gguf_writer.add_dataset_description(
key, dataset_entry["description"]
)
if "url" in dataset_entry:
gguf_writer.add_dataset_url(key, dataset_entry["url"])
if "doi" in dataset_entry:
gguf_writer.add_dataset_doi(key, dataset_entry["doi"])
if "uuid" in dataset_entry:
gguf_writer.add_dataset_uuid(key, dataset_entry["uuid"])
if "repo_url" in dataset_entry:
gguf_writer.add_dataset_repo_url(key, dataset_entry["repo_url"])
if self.tags is not None: if self.tags is not None:
gguf_writer.add_tags(self.tags) gguf_writer.add_tags(self.tags)
if self.languages is not None: if self.languages is not None:
gguf_writer.add_languages(self.languages) gguf_writer.add_languages(self.languages)
if self.datasets is not None:
gguf_writer.add_datasets(self.datasets)
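
For reference, the base-model and dataset heuristics above boil down to: URLs become repo_url, and Hugging Face URLs or bare "org/name" IDs are additionally split into organization and name. A rough sketch of that flow (classify_source is a made-up helper, and the title-casing is only a loose stand-in for Metadata.id_to_title / get_model_id_components):

import re

def classify_source(entry: str) -> dict:
    source: dict = {}
    if entry.startswith(("http://", "https://", "ssh://")):
        source["repo_url"] = entry
        # Only Hugging Face URLs of the form org/name yield further components
        match = re.match(r"https?://huggingface.co/([^/]+/[^/]+)$", entry)
        if not match:
            return source
        entry = match.group(1)
    org, _, name = entry.partition("/")
    if name:
        source["organization"] = org.replace("-", " ").title()
        source["name"] = name.replace("-", " ").title()
    return source

print(classify_source("https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0.1"))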

src/gguf-py/gguf/quants.py (new file, 147 lines)
View File

@ -0,0 +1,147 @@
from __future__ import annotations
from typing import Callable, Sequence
from numpy.typing import DTypeLike
from .constants import GGML_QUANT_SIZES, GGMLQuantizationType
from .lazy import LazyNumpyTensor
import numpy as np
def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
block_size, type_size = GGML_QUANT_SIZES[quant_type]
if shape[-1] % block_size != 0:
raise ValueError(
f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})"
)
return (*shape[:-1], shape[-1] // block_size * type_size)
def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
block_size, type_size = GGML_QUANT_SIZES[quant_type]
if shape[-1] % type_size != 0:
raise ValueError(
f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})"
)
return (*shape[:-1], shape[-1] // type_size * block_size)
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
n = n.astype(np.float32, copy=False).view(np.uint32)
# force nan to quiet
n = np.where(
(n & 0x7FFFFFFF) > 0x7F800000,
(n & np.uint32(0xFFFF0000)) | np.uint32(64 << 16),
n,
)
# round to nearest even
n = (np.uint64(n) + (0x7FFF + ((n >> 16) & 1))) >> 16
return n.astype(np.uint16)
# This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
def __apply_over_grouped_rows(
func: Callable[[np.ndarray], np.ndarray],
arr: np.ndarray,
otype: DTypeLike,
oshape: tuple[int, ...],
) -> np.ndarray:
rows = arr.reshape((-1, arr.shape[-1]))
osize = 1
for dim in oshape:
osize *= dim
out = np.empty(shape=osize, dtype=otype)
# compute over groups of 16 rows (arbitrary, but seems good for performance)
n_groups = (rows.shape[0] // 16) or 1
np.concatenate(
[func(group).ravel() for group in np.array_split(rows, n_groups)],
axis=0,
out=out,
)
return out.reshape(oshape)
def __quantize_bf16_array(n: np.ndarray) -> np.ndarray:
return __apply_over_grouped_rows(
__compute_fp32_to_bf16, arr=n, otype=np.uint16, oshape=n.shape
)
__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(
__quantize_bf16_array, meta_noop=np.uint16
)
def quantize_bf16(n: np.ndarray):
if type(n) is LazyNumpyTensor:
return __quantize_bf16_lazy(n)
else:
return __quantize_bf16_array(n)
__q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0]
def can_quantize_to_q8_0(n: np.ndarray) -> bool:
return n.shape[-1] % __q8_block_size == 0
# round away from zero
# ref: https://stackoverflow.com/a/59143326/22827863
def np_roundf(n: np.ndarray) -> np.ndarray:
a = abs(n)
floored = np.floor(a)
b = floored + np.floor(2 * (a - floored))
return np.sign(n) * b
def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]:
return (*s[:-1], s[-1] // __q8_block_size * __q8_type_size)
# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray:
shape = n.shape
assert shape[-1] % __q8_block_size == 0
n_blocks = n.size // __q8_block_size
blocks = n.reshape((n_blocks, __q8_block_size)).astype(np.float32, copy=False)
d = abs(blocks).max(axis=1, keepdims=True) / 127
with np.errstate(divide="ignore"):
id = np.where(d == 0, 0, 1 / d)
qs = np_roundf(blocks * id)
# (n_blocks, 2)
d = d.astype(np.float16).view(np.uint8)
# (n_blocks, block_size)
qs = qs.astype(np.int8).view(np.uint8)
assert d.shape[1] + qs.shape[1] == __q8_type_size
return np.concatenate([d, qs], axis=1).reshape(__quantize_q8_0_shape_change(shape))
def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray:
return __apply_over_grouped_rows(
__quantize_q8_0_rows,
arr=n,
otype=np.uint8,
oshape=__quantize_q8_0_shape_change(n.shape),
)
__quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn(
__quantize_q8_0_array,
meta_noop=(np.uint8, __quantize_q8_0_shape_change),
)
def quantize_q8_0(data: np.ndarray):
if type(data) is LazyNumpyTensor:
return __quantize_q8_0_lazy(data)
else:
return __quantize_q8_0_array(data)
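
A hypothetical usage of the quantization helpers in this new file; the import path assumes the src/gguf-py package layout shown in this diff.

import numpy as np
from gguf.constants import GGMLQuantizationType
from gguf.quants import quantize_q8_0, quant_shape_to_byte_shape

rows = np.random.rand(4, 64).astype(np.float32)   # row size must be a multiple of the Q8_0 block size (32)
packed = quantize_q8_0(rows)                      # each block packs an fp16 scale plus 32 int8 values
expected = quant_shape_to_byte_shape(rows.shape, GGMLQuantizationType.Q8_0)
assert packed.dtype == np.uint8 and packed.shape == expected  # (4, 68): 32 floats become 34 bytes per block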

View File

@ -10,10 +10,10 @@ class TensorNameMap:
# Token embeddings # Token embeddings
MODEL_TENSOR.TOKEN_EMBD: ( MODEL_TENSOR.TOKEN_EMBD: (
"gpt_neox.embed_in", # gptneox "gpt_neox.embed_in", # gptneox
"transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais
"transformer.word_embeddings", # falcon "transformer.word_embeddings", # falcon
"word_embeddings", # bloom "word_embeddings", # bloom
"model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414 "model.embed_tokens", # llama-hf
"tok_embeddings", # llama-pth "tok_embeddings", # llama-pth
"embeddings.word_embeddings", # bert nomic-bert "embeddings.word_embeddings", # bert nomic-bert
"language_model.embedding.word_embeddings", # persimmon "language_model.embedding.word_embeddings", # persimmon
@ -27,10 +27,6 @@ class TensorNameMap:
"embedding.word_embeddings", # chatglm "embedding.word_embeddings", # chatglm
"transformer.token_embeddings", # openelm "transformer.token_embeddings", # openelm
"shared", # t5 "shared", # t5
"rwkv.embeddings", # rwkv6
"model.embeddings", # rwkv7
"model.word_embeddings", # bailingmoe
"language_model.model.embed_tokens", # llama4
), ),
# Token type embeddings # Token type embeddings
MODEL_TENSOR.TOKEN_TYPES: ( MODEL_TENSOR.TOKEN_TYPES: (
@ -42,11 +38,6 @@ class TensorNameMap:
"embeddings.LayerNorm", # bert "embeddings.LayerNorm", # bert
"emb_ln", # nomic-bert "emb_ln", # nomic-bert
"transformer.norm", # openelm "transformer.norm", # openelm
"rwkv.blocks.0.pre_ln", # rwkv
"rwkv.blocks.0.pre_ln", # rwkv6
"model.pre_ln", # rwkv7
"model.layers.0.pre_norm", # rwkv7
"backbone.norm", # wavtokenizer
), ),
# Position embeddings # Position embeddings
MODEL_TENSOR.POS_EMBD: ( MODEL_TENSOR.POS_EMBD: (
@ -57,20 +48,17 @@ class TensorNameMap:
# Output # Output
MODEL_TENSOR.OUTPUT: ( MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox "embed_out", # gptneox
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais
"output", # llama-pth bloom internlm2 "output", # llama-pth bloom internlm2
"word_embeddings_for_head", # persimmon "word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2 "lm_head.linear", # phi2
"output_layer", # chatglm "output_layer", # chatglm
"head", # rwkv
"head.out", # wavtokenizer
"lm_head", # llama4
), ),
# Output norm # Output norm
MODEL_TENSOR.OUTPUT_NORM: ( MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox "gpt_neox.final_layer_norm", # gptneox
"transformer.ln_f", # gpt2 gpt-j falcon jais exaone "transformer.ln_f", # gpt2 gpt-j falcon jais
"model.norm", # llama-hf baichuan internlm2 olmoe olmo2 phimoe "model.norm", # llama-hf baichuan internlm2
"norm", # llama-pth "norm", # llama-pth
"transformer.norm_f", # mpt dbrx "transformer.norm_f", # mpt dbrx
"ln_f", # refact bloom qwen gpt2 "ln_f", # refact bloom qwen gpt2
@ -82,32 +70,24 @@ class TensorNameMap:
"transformer.rms_norm", # Grok "transformer.rms_norm", # Grok
"encoder.final_layernorm", # chatglm "encoder.final_layernorm", # chatglm
"transformer.norm", # openelm "transformer.norm", # openelm
"model.norm", # nemotron
"rwkv.ln_out", # rwkv6
"model.ln_out", # rwkv7
"backbone.final_layer_norm", # wavtokenizer
"model.norm", # llama4
), ),
# Rope frequencies # Rope frequencies
MODEL_TENSOR.ROPE_FREQS: ( MODEL_TENSOR.ROPE_FREQS: (
"rope.freqs", # llama-pth "rope.freqs", # llama-pth
"rotary_pos_emb.inv_freq", # chatglm "rotary_pos_emb.inv_freq", # chatglm
), ),
MODEL_TENSOR.ROPE_FACTORS_LONG: (),
MODEL_TENSOR.ROPE_FACTORS_SHORT: (),
MODEL_TENSOR.CONV1D: ("backbone.embed",), # roberta
} }
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
# Attention norm # Attention norm
MODEL_TENSOR.ATTN_NORM: ( MODEL_TENSOR.ATTN_NORM: (
"gpt_neox.layers.{bid}.input_layernorm", # gptneox "gpt_neox.layers.{bid}.input_layernorm", # gptneox
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen jais exaone "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen jais
"transformer.blocks.{bid}.norm_1", # mpt "transformer.blocks.{bid}.norm_1", # mpt
"transformer.h.{bid}.input_layernorm", # falcon7b "transformer.h.{bid}.input_layernorm", # falcon7b
"h.{bid}.input_layernorm", # bloom "h.{bid}.input_layernorm", # bloom
"transformer.h.{bid}.ln_mlp", # falcon40b "transformer.h.{bid}.ln_mlp", # falcon40b
"model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe phimoe "model.layers.{bid}.input_layernorm", # llama-hf
"layers.{bid}.attention_norm", # llama-pth "layers.{bid}.attention_norm", # llama-pth
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon "language_model.encoder.layers.{bid}.input_layernorm", # persimmon
"model.layers.{bid}.ln1", # yi "model.layers.{bid}.ln1", # yi
@ -121,16 +101,11 @@ class TensorNameMap:
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
"encoder.layers.{bid}.input_layernorm", # chatglm "encoder.layers.{bid}.input_layernorm", # chatglm
"transformer.layers.{bid}.attn_norm", # openelm "transformer.layers.{bid}.attn_norm", # openelm
"rwkv.blocks.{bid}.ln1", # rwkv6
"model.layers.{bid}.ln1", # rwkv7
"model.layers.{bid}.input_layernorm", # llama4
), ),
# Attention norm 2 # Attention norm 2
MODEL_TENSOR.ATTN_NORM_2: ( MODEL_TENSOR.ATTN_NORM_2: (
"transformer.h.{bid}.ln_attn", # falcon40b "transformer.h.{bid}.ln_attn", # falcon40b
"encoder.layer.{bid}.layer_norm_1", # jina-v2-code "encoder.layer.{bid}.layer_norm_1", # jina-v2-code
"rwkv.blocks.{bid}.ln2", # rwkv6
"model.layers.{bid}.ln2", # rwkv7
), ),
# Attention query-key-value # Attention query-key-value
MODEL_TENSOR.ATTN_QKV: ( MODEL_TENSOR.ATTN_QKV: (
@ -151,21 +126,17 @@ class TensorNameMap:
), ),
# Attention query # Attention query
MODEL_TENSOR.ATTN_Q: ( MODEL_TENSOR.ATTN_Q: (
"model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2 phimoe "model.layers.{bid}.self_attn.q_proj", # llama-hf
"model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom
"layers.{bid}.attention.wq", # llama-pth "layers.{bid}.attention.wq", # llama-pth
"encoder.layer.{bid}.attention.self.query", # bert "encoder.layer.{bid}.attention.self.query", # bert
"transformer.h.{bid}.attn.q_proj", # gpt-j "transformer.h.{bid}.attn.q_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.q_proj", # plamo "model.layers.layers.{bid}.self_attn.q_proj", # plamo
"model.layers.{bid}.attention.wq", # internlm2 "model.layers.{bid}.attention.wq", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.query", # Grok "transformer.decoder_layer.{bid}.multi_head_attention.query", # Grok
"transformer.h.{bid}.attn.attention.q_proj", # exaone
"model.layers.{bid}.self_attn.q_proj", # llama4
), ),
# Attention key # Attention key
MODEL_TENSOR.ATTN_K: ( MODEL_TENSOR.ATTN_K: (
"model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2 phimoe "model.layers.{bid}.self_attn.k_proj", # llama-hf
"model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom
"layers.{bid}.attention.wk", # llama-pth "layers.{bid}.attention.wk", # llama-pth
"encoder.layer.{bid}.attention.self.key", # bert "encoder.layer.{bid}.attention.self.key", # bert
"transformer.h.{bid}.attn.k_proj", # gpt-j "transformer.h.{bid}.attn.k_proj", # gpt-j
@ -173,12 +144,10 @@ class TensorNameMap:
"model.layers.layers.{bid}.self_attn.k_proj", # plamo "model.layers.layers.{bid}.self_attn.k_proj", # plamo
"model.layers.{bid}.attention.wk", # internlm2 "model.layers.{bid}.attention.wk", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok "transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok
"transformer.h.{bid}.attn.attention.k_proj", # exaone
"model.layers.{bid}.self_attn.k_proj", # llama4
), ),
# Attention value # Attention value
MODEL_TENSOR.ATTN_V: ( MODEL_TENSOR.ATTN_V: (
"model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2 phimoe "model.layers.{bid}.self_attn.v_proj", # llama-hf
"layers.{bid}.attention.wv", # llama-pth "layers.{bid}.attention.wv", # llama-pth
"encoder.layer.{bid}.attention.self.value", # bert "encoder.layer.{bid}.attention.self.value", # bert
"transformer.h.{bid}.attn.v_proj", # gpt-j "transformer.h.{bid}.attn.v_proj", # gpt-j
@ -186,8 +155,6 @@ class TensorNameMap:
"model.layers.layers.{bid}.self_attn.v_proj", # plamo "model.layers.layers.{bid}.self_attn.v_proj", # plamo
"model.layers.{bid}.attention.wv", # internlm2 "model.layers.{bid}.attention.wv", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.value", # Grok "transformer.decoder_layer.{bid}.multi_head_attention.value", # Grok
"transformer.h.{bid}.attn.attention.v_proj", # exaone
"model.layers.{bid}.self_attn.v_proj", # llama4
), ),
# Attention output # Attention output
MODEL_TENSOR.ATTN_OUT: ( MODEL_TENSOR.ATTN_OUT: (
@ -196,8 +163,7 @@ class TensorNameMap:
"transformer.blocks.{bid}.attn.out_proj", # mpt "transformer.blocks.{bid}.attn.out_proj", # mpt
"transformer.h.{bid}.self_attention.dense", # falcon "transformer.h.{bid}.self_attention.dense", # falcon
"h.{bid}.self_attention.dense", # bloom "h.{bid}.self_attention.dense", # bloom
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2 phimoe "model.layers.{bid}.self_attn.o_proj", # llama-hf
"model.layers.{bid}.self_attn.linear_attn", # deci
"layers.{bid}.attention.wo", # llama-pth "layers.{bid}.attention.wo", # llama-pth
"encoder.layer.{bid}.attention.output.dense", # bert "encoder.layer.{bid}.attention.output.dense", # bert
"transformer.h.{bid}.attn.out_proj", # gpt-j "transformer.h.{bid}.attn.out_proj", # gpt-j
@ -212,8 +178,6 @@ class TensorNameMap:
"transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
"encoder.layers.{bid}.self_attention.dense", # chatglm "encoder.layers.{bid}.self_attention.dense", # chatglm
"transformer.layers.{bid}.attn.out_proj", # openelm "transformer.layers.{bid}.attn.out_proj", # openelm
"transformer.h.{bid}.attn.attention.out_proj", # exaone
"model.layers.{bid}.self_attn.o_proj", # llama4
), ),
# Attention output norm # Attention output norm
MODEL_TENSOR.ATTN_OUT_NORM: ( MODEL_TENSOR.ATTN_OUT_NORM: (
@ -223,8 +187,7 @@ class TensorNameMap:
"transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
), ),
MODEL_TENSOR.ATTN_POST_NORM: ( MODEL_TENSOR.ATTN_POST_NORM: (
"model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 # ge "model.layers.{bid}.post_attention_layernorm", # gemma2
"model.layers.{bid}.post_self_attn_layernorm", # glm-4-0414
), ),
# Rotary embeddings # Rotary embeddings
MODEL_TENSOR.ATTN_ROT_EMBD: ( MODEL_TENSOR.ATTN_ROT_EMBD: (
@ -236,10 +199,10 @@ class TensorNameMap:
# Feed-forward norm # Feed-forward norm
MODEL_TENSOR.FFN_NORM: ( MODEL_TENSOR.FFN_NORM: (
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
"transformer.h.{bid}.ln_2", # gpt2 refact qwen jais exaone "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais
"h.{bid}.post_attention_layernorm", # bloom "h.{bid}.post_attention_layernorm", # bloom
"transformer.blocks.{bid}.norm_2", # mpt "transformer.blocks.{bid}.norm_2", # mpt
"model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron olmoe phimoe "model.layers.{bid}.post_attention_layernorm", # llama-hf
"layers.{bid}.ffn_norm", # llama-pth "layers.{bid}.ffn_norm", # llama-pth
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
"model.layers.{bid}.ln2", # yi "model.layers.{bid}.ln2", # yi
@ -248,7 +211,6 @@ class TensorNameMap:
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
"encoder.layers.{bid}.post_attention_layernorm", # chatglm "encoder.layers.{bid}.post_attention_layernorm", # chatglm
"transformer.layers.{bid}.ffn_norm", # openelm "transformer.layers.{bid}.ffn_norm", # openelm
"model.layers.{bid}.post_attention_layernorm", # llama4
), ),
# Post feed-forward norm # Post feed-forward norm
MODEL_TENSOR.FFN_PRE_NORM: ( MODEL_TENSOR.FFN_PRE_NORM: (
@ -256,25 +218,18 @@ class TensorNameMap:
), ),
# Post feed-forward norm # Post feed-forward norm
MODEL_TENSOR.FFN_POST_NORM: ( MODEL_TENSOR.FFN_POST_NORM: (
"model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2 "model.layers.{bid}.post_feedforward_layernorm", # gemma2
"model.layers.{bid}.post_mlp_layernorm", # glm-4-0414
), ),
MODEL_TENSOR.FFN_GATE_INP: ( MODEL_TENSOR.FFN_GATE_INP: (
"layers.{bid}.feed_forward.gate", # mixtral "layers.{bid}.feed_forward.gate", # mixtral
"model.layers.{bid}.block_sparse_moe.gate", # mixtral phimoe "model.layers.{bid}.block_sparse_moe.gate", # mixtral
"model.layers.{bid}.mlp.gate", # qwen2moe olmoe "model.layers.{bid}.mlp.gate", # qwen2moe
"transformer.decoder_layer.{bid}.router", # Grok "transformer.decoder_layer.{bid}.router", # Grok
"transformer.blocks.{bid}.ffn.router.layer", # dbrx "transformer.blocks.{bid}.ffn.router.layer", # dbrx
"model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
"model.layers.{bid}.feed_forward.router", # llama4
"encoder.layers.{bid}.mlp.router.layer", # nomic-bert-moe
), ),
MODEL_TENSOR.FFN_GATE_INP_SHEXP: ( MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
"model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe "model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe
), ),
MODEL_TENSOR.FFN_EXP_PROBS_B: (
"model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3
),
# Feed-forward up # Feed-forward up
MODEL_TENSOR.FFN_UP: ( MODEL_TENSOR.FFN_UP: (
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
@ -282,7 +237,7 @@ class TensorNameMap:
"transformer.blocks.{bid}.ffn.up_proj", # mpt "transformer.blocks.{bid}.ffn.up_proj", # mpt
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
"h.{bid}.mlp.dense_h_to_4h", # bloom "h.{bid}.mlp.dense_h_to_4h", # bloom
"model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2 "model.layers.{bid}.mlp.up_proj", # llama-hf refact
"layers.{bid}.feed_forward.w3", # llama-pth "layers.{bid}.feed_forward.w3", # llama-pth
"encoder.layer.{bid}.intermediate.dense", # bert "encoder.layer.{bid}.intermediate.dense", # bert
"transformer.h.{bid}.mlp.fc_in", # gpt-j "transformer.h.{bid}.mlp.fc_in", # gpt-j
@ -293,37 +248,30 @@ class TensorNameMap:
"h.{bid}.mlp.c_fc", # gpt2 "h.{bid}.mlp.c_fc", # gpt2
"transformer.h.{bid}.mlp.fc1", # phi2 "transformer.h.{bid}.mlp.fc1", # phi2
"model.layers.{bid}.mlp.fc1", # phi2 "model.layers.{bid}.mlp.fc1", # phi2
"model.layers.{bid}.mlp.gate_up_proj", # phi3 glm-4-0414 "model.layers.{bid}.mlp.gate_up_proj", # phi3
"model.layers.layers.{bid}.mlp.up_proj", # plamo "model.layers.layers.{bid}.mlp.up_proj", # plamo
"model.layers.{bid}.feed_forward.w3", # internlm2 "model.layers.{bid}.feed_forward.w3", # internlm2
"encoder.layers.{bid}.mlp.fc11", # nomic-bert "encoder.layers.{bid}.mlp.fc11", # nomic-bert
"encoder.layers.{bid}.mlp.fc1", # nomic-bert-moe
"model.layers.{bid}.mlp.c_fc", # starcoder2 "model.layers.{bid}.mlp.c_fc", # starcoder2
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
"model.layers.{bid}.residual_mlp.w3", # arctic "model.layers.{bid}.residual_mlp.w3", # arctic
"encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
"transformer.h.{bid}.mlp.c_fc_1", # exaone
"model.layers.{bid}.feed_forward.up_proj", # llama4
), ),
MODEL_TENSOR.FFN_UP_EXP: ( MODEL_TENSOR.FFN_UP_EXP: (
"layers.{bid}.feed_forward.experts.w3", # mixtral (merged) "layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
"model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe (merged)
"model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
"model.layers.{bid}.feed_forward.experts.up_proj", # llama4
"encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe
), ),
MODEL_TENSOR.FFN_UP_SHEXP: ( MODEL_TENSOR.FFN_UP_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
"model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2 "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek2
"model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4
), ),
# AWQ-activation gate # AWQ-activation gate
MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",), # mpt MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",), # mpt
# Feed-forward gate # Feed-forward gate
MODEL_TENSOR.FFN_GATE: ( MODEL_TENSOR.FFN_GATE: (
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2 "model.layers.{bid}.mlp.gate_proj", # llama-hf refact
"layers.{bid}.feed_forward.w1", # llama-pth "layers.{bid}.feed_forward.w1", # llama-pth
"transformer.h.{bid}.mlp.w2", # qwen "transformer.h.{bid}.mlp.w2", # qwen
"transformer.h.{bid}.mlp.c_fc2", # jais "transformer.h.{bid}.mlp.c_fc2", # jais
@ -333,21 +281,16 @@ class TensorNameMap:
"encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2
"transformer.h.{bid}.mlp.linear_1", # refact "transformer.h.{bid}.mlp.linear_1", # refact
"model.layers.{bid}.residual_mlp.w1", # arctic "model.layers.{bid}.residual_mlp.w1", # arctic
"transformer.h.{bid}.mlp.c_fc_0", # exaone
"model.layers.{bid}.feed_forward.gate_proj", # llama4
), ),
MODEL_TENSOR.FFN_GATE_EXP: ( MODEL_TENSOR.FFN_GATE_EXP: (
"layers.{bid}.feed_forward.experts.w1", # mixtral (merged) "layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
"model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe (merged)
"model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged)
"model.layers.{bid}.feed_forward.experts.gate_proj", # llama4
), ),
MODEL_TENSOR.FFN_GATE_SHEXP: ( MODEL_TENSOR.FFN_GATE_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
"model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2 "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek2
"model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4
), ),
# Feed-forward down # Feed-forward down
MODEL_TENSOR.FFN_DOWN: ( MODEL_TENSOR.FFN_DOWN: (
@ -356,7 +299,7 @@ class TensorNameMap:
"transformer.blocks.{bid}.ffn.down_proj", # mpt "transformer.blocks.{bid}.ffn.down_proj", # mpt
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
"h.{bid}.mlp.dense_4h_to_h", # bloom "h.{bid}.mlp.dense_4h_to_h", # bloom
"model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2 "model.layers.{bid}.mlp.down_proj", # llama-hf
"layers.{bid}.feed_forward.w2", # llama-pth "layers.{bid}.feed_forward.w2", # llama-pth
"encoder.layer.{bid}.output.dense", # bert "encoder.layer.{bid}.output.dense", # bert
"transformer.h.{bid}.mlp.fc_out", # gpt-j "transformer.h.{bid}.mlp.fc_out", # gpt-j
@ -374,29 +317,21 @@ class TensorNameMap:
"model.layers.{bid}.residual_mlp.w2", # arctic "model.layers.{bid}.residual_mlp.w2", # arctic
"encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2 "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2
"encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
"model.layers.h.{bid}.mlp.c_proj", # exaone
"model.layers.{bid}.feed_forward.down_proj", # llama4
), ),
MODEL_TENSOR.FFN_DOWN_EXP: ( MODEL_TENSOR.FFN_DOWN_EXP: (
"layers.{bid}.feed_forward.experts.w2", # mixtral (merged) "layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged) "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
"model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe (merged)
"model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
"model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
"model.layers.{bid}.feed_forward.experts.down_proj", # llama4
"encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe
), ),
MODEL_TENSOR.FFN_DOWN_SHEXP: ( MODEL_TENSOR.FFN_DOWN_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe "model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
"model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2 "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek2
"model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4
"model.layers.{bid}.shared_mlp.output_linear", # granitemoe
), ),
MODEL_TENSOR.ATTN_Q_NORM: ( MODEL_TENSOR.ATTN_Q_NORM: (
"language_model.encoder.layers.{bid}.self_attention.q_layernorm", "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
"model.layers.{bid}.self_attn.q_layernorm", # persimmon "model.layers.{bid}.self_attn.q_layernorm", # persimmon
"model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo2 "model.layers.{bid}.self_attn.q_norm", # cohere
"transformer.blocks.{bid}.attn.q_ln", # sea-lion "transformer.blocks.{bid}.attn.q_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2 "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
"transformer.layers.{bid}.attn.q_norm", # openelm "transformer.layers.{bid}.attn.q_norm", # openelm
@ -404,7 +339,7 @@ class TensorNameMap:
MODEL_TENSOR.ATTN_K_NORM: ( MODEL_TENSOR.ATTN_K_NORM: (
"language_model.encoder.layers.{bid}.self_attention.k_layernorm", "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
"model.layers.{bid}.self_attn.k_layernorm", # persimmon "model.layers.{bid}.self_attn.k_layernorm", # persimmon
"model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo2 "model.layers.{bid}.self_attn.k_norm", # cohere
"transformer.blocks.{bid}.attn.k_ln", # sea-lion "transformer.blocks.{bid}.attn.k_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2 "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
"transformer.layers.{bid}.attn.k_norm", # openelm "transformer.layers.{bid}.attn.k_norm", # openelm
@ -447,117 +382,6 @@ class TensorNameMap:
"model.layers.{bid}.out_proj", "model.layers.{bid}.out_proj",
"backbone.layers.{bid}.mixer.out_proj", "backbone.layers.{bid}.mixer.out_proj",
), ),
MODEL_TENSOR.TIME_MIX_W0: ("model.layers.{bid}.attention.w0",), # rwkv7
MODEL_TENSOR.TIME_MIX_W1: (
"rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv6
"model.layers.{bid}.self_attn.time_maa_w1", # rwkv6qwen2
"model.layers.{bid}.attention.w1", # rwkv7
),
MODEL_TENSOR.TIME_MIX_W2: (
"rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv6
"model.layers.{bid}.self_attn.time_maa_w2", # rwkv6qwen2
"model.layers.{bid}.attention.w2", # rwkv7
),
MODEL_TENSOR.TIME_MIX_A0: ("model.layers.{bid}.attention.a0",), # rwkv7
MODEL_TENSOR.TIME_MIX_A1: ("model.layers.{bid}.attention.a1",), # rwkv7
MODEL_TENSOR.TIME_MIX_A2: ("model.layers.{bid}.attention.a2",), # rwkv7
MODEL_TENSOR.TIME_MIX_V0: ("model.layers.{bid}.attention.v0",), # rwkv7
MODEL_TENSOR.TIME_MIX_V1: ("model.layers.{bid}.attention.v1",), # rwkv7
MODEL_TENSOR.TIME_MIX_V2: ("model.layers.{bid}.attention.v2",), # rwkv7
MODEL_TENSOR.TIME_MIX_G1: ("model.layers.{bid}.attention.g1",), # rwkv7
MODEL_TENSOR.TIME_MIX_G2: ("model.layers.{bid}.attention.g2",), # rwkv7
MODEL_TENSOR.TIME_MIX_K_K: ("model.layers.{bid}.attention.k_k",), # rwkv7
MODEL_TENSOR.TIME_MIX_K_A: ("model.layers.{bid}.attention.k_a",), # rwkv7
MODEL_TENSOR.TIME_MIX_R_K: ("model.layers.{bid}.attention.r_k",), # rwkv7
MODEL_TENSOR.TIME_MIX_LERP_X: (
"rwkv.blocks.{bid}.attention.time_maa_x", # rwkv6
"model.layers.{bid}.self_attn.time_maa_x", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_LERP_K: (
"rwkv.blocks.{bid}.attention.time_maa_k", # rwkv6
"model.layers.{bid}.self_attn.time_maa_k", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_LERP_V: (
"rwkv.blocks.{bid}.attention.time_maa_v", # rwkv6
"model.layers.{bid}.self_attn.time_maa_v", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_LERP_R: (
"rwkv.blocks.{bid}.attention.time_maa_r", # rwkv6
"model.layers.{bid}.self_attn.time_maa_r", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_LERP_G: (
"rwkv.blocks.{bid}.attention.time_maa_g", # rwkv6
"model.layers.{bid}.self_attn.time_maa_g", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_LERP_W: (
"rwkv.blocks.{bid}.attention.time_maa_w", # rwkv6
"model.layers.{bid}.self_attn.time_maa_w", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_FIRST: (
"rwkv.blocks.{bid}.attention.time_faaaa", # rwkv6
),
MODEL_TENSOR.TIME_MIX_DECAY: (
"rwkv.blocks.{bid}.attention.time_decay", # rwkv6
"model.layers.{bid}.self_attn.time_decay", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_DECAY_W1: (
"rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv6
"model.layers.{bid}.self_attn.time_decay_w1", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_DECAY_W2: (
"rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv6
"model.layers.{bid}.self_attn.time_decay_w2", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_KEY: (
"rwkv.blocks.{bid}.attention.key", # rwkv6
"model.layers.{bid}.self_attn.k_proj", # rwkv6qwen2
"model.layers.{bid}.attention.key", # rwkv7
"model.layers.{bid}.attention.k_proj", # rwkv7
),
MODEL_TENSOR.TIME_MIX_VALUE: (
"rwkv.blocks.{bid}.attention.value", # rwkv6
"model.layers.{bid}.self_attn.v_proj", # rwkv6qwen2
"model.layers.{bid}.attention.value", # rwkv7
"model.layers.{bid}.attention.v_proj", # rwkv7
),
MODEL_TENSOR.TIME_MIX_RECEPTANCE: (
"rwkv.blocks.{bid}.attention.receptance", # rwkv6
"model.layers.{bid}.self_attn.q_proj", # rwkv6qwen2
"model.layers.{bid}.attention.receptance", # rwkv7
"model.layers.{bid}.attention.r_proj", # rwkv7
),
MODEL_TENSOR.TIME_MIX_GATE: (
"rwkv.blocks.{bid}.attention.gate", # rwkv6
"model.layers.{bid}.self_attn.gate", # rwkv6qwen2
),
MODEL_TENSOR.TIME_MIX_LN: (
"rwkv.blocks.{bid}.attention.ln_x", # rwkv6
"model.layers.{bid}.attention.ln_x", # rwkv7
),
MODEL_TENSOR.TIME_MIX_OUTPUT: (
"rwkv.blocks.{bid}.attention.output", # rwkv6
"model.layers.{bid}.self_attn.o_proj", # rwkv6qwen2
"model.layers.{bid}.attention.output", # rwkv7
"model.layers.{bid}.attention.o_proj", # rwkv7
),
MODEL_TENSOR.CHANNEL_MIX_LERP_K: (
"rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv6
"model.layers.{bid}.feed_forward.x_k", # rwkv7
),
MODEL_TENSOR.CHANNEL_MIX_LERP_R: (
"rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv6
),
MODEL_TENSOR.CHANNEL_MIX_KEY: (
"rwkv.blocks.{bid}.feed_forward.key", # rwkv6
"model.layers.{bid}.feed_forward.key", # rwkv7
),
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
"rwkv.blocks.{bid}.feed_forward.receptance", # rwkv6
),
MODEL_TENSOR.CHANNEL_MIX_VALUE: (
"rwkv.blocks.{bid}.feed_forward.value", # rwkv6
"model.layers.{bid}.feed_forward.value", # rwkv7
),
MODEL_TENSOR.ATTN_Q_A: ("model.layers.{bid}.self_attn.q_a_proj",), # deepseek2 MODEL_TENSOR.ATTN_Q_A: ("model.layers.{bid}.self_attn.q_a_proj",), # deepseek2
MODEL_TENSOR.ATTN_Q_B: ("model.layers.{bid}.self_attn.q_b_proj",), # deepseek2 MODEL_TENSOR.ATTN_Q_B: ("model.layers.{bid}.self_attn.q_b_proj",), # deepseek2
MODEL_TENSOR.ATTN_KV_A_MQA: ( MODEL_TENSOR.ATTN_KV_A_MQA: (
@ -566,8 +390,6 @@ class TensorNameMap:
MODEL_TENSOR.ATTN_KV_B: ( MODEL_TENSOR.ATTN_KV_B: (
"model.layers.{bid}.self_attn.kv_b_proj", # deepseek2 "model.layers.{bid}.self_attn.kv_b_proj", # deepseek2
), ),
MODEL_TENSOR.ATTN_K_B: ("model.layers.{bid}.self_attn.k_b_proj",), # deepseek2
MODEL_TENSOR.ATTN_V_B: ("model.layers.{bid}.self_attn.v_b_proj",), # deepseek2
MODEL_TENSOR.ATTN_Q_A_NORM: ( MODEL_TENSOR.ATTN_Q_A_NORM: (
"model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2 "model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2
), ),
@ -639,173 +461,7 @@ class TensorNameMap:
MODEL_TENSOR.ENC_FFN_DOWN: ( MODEL_TENSOR.ENC_FFN_DOWN: (
"encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5 "encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5
), ),
############################################################################
# TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg
MODEL_TENSOR.ENC_OUTPUT_NORM: ("encoder.final_layer_norm",), # t5 MODEL_TENSOR.ENC_OUTPUT_NORM: ("encoder.final_layer_norm",), # t5
MODEL_TENSOR.CLS: (
"classifier", # jina
"classifier.dense", # roberta
),
MODEL_TENSOR.CLS_OUT: ("classifier.out_proj",), # roberta
#############################################################################
MODEL_TENSOR.CONVNEXT_DW: ("backbone.convnext.{bid}.dwconv",), # wavtokenizer
MODEL_TENSOR.CONVNEXT_NORM: ("backbone.convnext.{bid}.norm",), # wavtokenizer
MODEL_TENSOR.CONVNEXT_PW1: ("backbone.convnext.{bid}.pwconv1",), # wavtokenizer
MODEL_TENSOR.CONVNEXT_PW2: ("backbone.convnext.{bid}.pwconv2",), # wavtokenizer
MODEL_TENSOR.CONVNEXT_GAMMA: ("backbone.convnext.{bid}.gamma",), # wavtokenizer
MODEL_TENSOR.POSNET_CONV1: ("backbone.posnet.{bid}.conv1",), # wavtokenizer
MODEL_TENSOR.POSNET_CONV2: ("backbone.posnet.{bid}.conv2",), # wavtokenizer
MODEL_TENSOR.POSNET_NORM: ("backbone.posnet.{bid}.norm",), # wavtokenizer
MODEL_TENSOR.POSNET_NORM1: ("backbone.posnet.{bid}.norm1",), # wavtokenizer
MODEL_TENSOR.POSNET_NORM2: ("backbone.posnet.{bid}.norm2",), # wavtokenizer
MODEL_TENSOR.POSNET_ATTN_NORM: ("backbone.posnet.{bid}.norm",), # wavtokenizer
MODEL_TENSOR.POSNET_ATTN_Q: ("backbone.posnet.{bid}.q",), # wavtokenizer
MODEL_TENSOR.POSNET_ATTN_K: ("backbone.posnet.{bid}.k",), # wavtokenizer
MODEL_TENSOR.POSNET_ATTN_V: ("backbone.posnet.{bid}.v",), # wavtokenizer
MODEL_TENSOR.POSNET_ATTN_OUT: (
"backbone.posnet.{bid}.proj_out", # wavtokenizer
),
#############################################################################
## Vision encoder
MODEL_TENSOR.V_MMPROJ: (
"multi_modal_projector.linear_{bid}",
"visual.merger.mlp.{bid}", # qwen2vl
),
MODEL_TENSOR.V_MMPROJ_FC: (
"model.connector.modality_projection.proj", # SmolVLM
),
MODEL_TENSOR.V_MMPROJ_MLP: (
"model.mm_projector.mlp.mlp.{bid}",
"mlp1.{bid}", # InternVL
),
MODEL_TENSOR.V_MMPROJ_PEG: ("model.mm_projector.peg.peg.{bid}",),
MODEL_TENSOR.V_ENC_EMBD_CLS: (
"vision_tower.vision_model.embeddings.class_embedding",
),
MODEL_TENSOR.V_ENC_EMBD_PATCH: (
"vision_tower.vision_model.embeddings.patch_embedding",
"vpm.embeddings.patch_embedding",
"model.vision_model.embeddings.patch_embedding", # SmolVLM
"vision_tower.patch_conv", # pixtral
"visual.patch_embed.proj", # qwen2vl
),
MODEL_TENSOR.V_ENC_EMBD_POS: (
"vision_tower.vision_model.embeddings.position_embedding",
"vpm.embeddings.position_embedding",
"model.vision_model.embeddings.position_embedding", # SmolVLM
),
MODEL_TENSOR.V_ENC_ATTN_Q: (
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
"vpm.encoder.layers.{bid}.self_attn.q_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
"vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral
"visual.blocks.{bid}.attn.q", # qwen2vl, generated
),
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
"vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
),
MODEL_TENSOR.V_ENC_ATTN_K: (
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
"vpm.encoder.layers.{bid}.self_attn.k_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
"vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral
"visual.blocks.{bid}.attn.k", # qwen2vl, generated
),
MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
"vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
),
MODEL_TENSOR.V_ENC_ATTN_V: (
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
"vpm.encoder.layers.{bid}.self_attn.v_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
"vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral
"visual.blocks.{bid}.attn.v", # qwen2vl, generated
),
MODEL_TENSOR.V_ENC_INPUT_NORM: (
"vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
"vision_tower.vision_model.encoder.layers.{bid}.norm1", # InternVL
"vpm.encoder.layers.{bid}.layer_norm1",
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
"vision_tower.transformer.layers.{bid}.attention_norm", # pixtral
"visual.blocks.{bid}.norm1", # qwen2vl
),
MODEL_TENSOR.V_ENC_OUTPUT: (
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
"vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL
"vpm.encoder.layers.{bid}.self_attn.out_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
"vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral
"visual.blocks.{bid}.attn.proj", # qwen2vl
),
MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
"vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
"vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL
"vpm.encoder.layers.{bid}.layer_norm2",
"model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
"vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral
"visual.blocks.{bid}.norm2", # qwen2vl
),
MODEL_TENSOR.V_ENC_FFN_UP: (
"vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
"vpm.encoder.layers.{bid}.mlp.fc1",
"model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
"vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral
"visual.blocks.{bid}.mlp.fc1", # qwen2vl
"visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
),
MODEL_TENSOR.V_ENC_FFN_GATE: (
"vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral
"visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
),
MODEL_TENSOR.V_ENC_FFN_DOWN: (
"vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
"vpm.encoder.layers.{bid}.mlp.fc2",
"model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
"vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral
"visual.blocks.{bid}.mlp.fc2", # qwen2vl
"visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
),
MODEL_TENSOR.V_LAYER_SCALE_1: (
"vision_tower.vision_model.encoder.layers.{bid}.ls1", # InternVL
),
MODEL_TENSOR.V_LAYER_SCALE_2: (
"vision_tower.vision_model.encoder.layers.{bid}.ls2", # InternVL
),
MODEL_TENSOR.V_PRE_NORM: (
"vision_tower.vision_model.pre_layrnorm",
"vision_tower.ln_pre", # pixtral
),
MODEL_TENSOR.V_POST_NORM: (
"vision_tower.vision_model.post_layernorm",
"model.vision_model.post_layernorm", # SmolVLM
"visual.merger.ln_q", # qwen2vl
),
MODEL_TENSOR.V_MM_INP_PROJ: ("multi_modal_projector.mm_input_projection",),
MODEL_TENSOR.V_MM_INP_NORM: ("multi_modal_projector.norm",),
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: ("multi_modal_projector.mm_soft_emb_norm",),
MODEL_TENSOR.V_RESMPL_POS_EMBD_K: ("resampler.pos_embed_k",),
MODEL_TENSOR.V_RESMPL_ATTN_Q: (
"resampler.attn.in_proj_q", # tensor generated from resampler.attn.in_proj
),
MODEL_TENSOR.V_RESMPL_ATTN_K: (
"resampler.attn.in_proj_k", # tensor generated from resampler.attn.in_proj
),
MODEL_TENSOR.V_RESMPL_ATTN_V: (
"resampler.attn.in_proj_v", # tensor generated from resampler.attn.in_proj
),
MODEL_TENSOR.V_RESMPL_ATTN_OUT: ("resampler.attn.out_proj",),
MODEL_TENSOR.V_RESMPL_KV: ("resampler.kv_proj",),
MODEL_TENSOR.V_RESMPL_POST_NORM: ("resampler.ln_post",),
MODEL_TENSOR.V_RESMPL_KV_NORM: ("resampler.ln_kv",),
MODEL_TENSOR.V_RESMPL_Q_NORM: ("resampler.ln_q",),
MODEL_TENSOR.V_RESMPL_PROJ: ("resampler.proj",),
MODEL_TENSOR.V_RESMPL_QUERY: ("resampler.query",),
MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: (
"v.token_embd.img_break", # for pixtral, this is a generated vector
),
MODEL_TENSOR.V_MM_PATCH_MERGER: (
"multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1
),
} }
# architecture-specific block mappings # architecture-specific block mappings
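# Illustrative sketch (not part of the diff): every entry above is a str.format
# template keyed by block id, e.g. for block 3 of a llama-style checkpoint
#   "model.layers.{bid}.mlp.up_proj".format(bid=3) -> "model.layers.3.mlp.up_proj"
# which this table associates with MODEL_TENSOR.FFN_UP.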

src/gguf-py/gguf/utility.py Normal file
View File

@ -0,0 +1,101 @@
from __future__ import annotations
from typing import Literal
def fill_templated_filename(filename: str, output_type: str | None) -> str:
# Given a file name fill in any type templates e.g. 'some-model-name.{ftype}.gguf'
ftype_lowercase: str = output_type.lower() if output_type is not None else ""
ftype_uppercase: str = output_type.upper() if output_type is not None else ""
return filename.format(
ftype_lowercase,
outtype=ftype_lowercase,
ftype=ftype_lowercase,
OUTTYPE=ftype_uppercase,
FTYPE=ftype_uppercase,
)
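# Illustrative usage (not part of the diff):
#   fill_templated_filename("some-model-name.{ftype}.gguf", "F16")
#   -> "some-model-name.f16.gguf"   ("{FTYPE}" would yield "F16", "{outtype}" "f16")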
def model_weight_count_rounded_notation(
model_params_count: int, min_digits: int = 2
) -> str:
if model_params_count > 1e12:
# Trillions Of Parameters
scaled_model_params = model_params_count * 1e-12
scale_suffix = "T"
elif model_params_count > 1e9:
# Billions Of Parameters
scaled_model_params = model_params_count * 1e-9
scale_suffix = "B"
elif model_params_count > 1e6:
# Millions Of Parameters
scaled_model_params = model_params_count * 1e-6
scale_suffix = "M"
else:
# Thousands Of Parameters
scaled_model_params = model_params_count * 1e-3
scale_suffix = "K"
fix = max(min_digits - len(str(round(scaled_model_params)).lstrip("0")), 0)
return f"{scaled_model_params:.{fix}f}{scale_suffix}"
def size_label(
total_params: int, shared_params: int, expert_params: int, expert_count: int
) -> str:
if expert_count > 0:
pretty_size = model_weight_count_rounded_notation(
abs(shared_params) + abs(expert_params), min_digits=2
)
size_class = f"{expert_count}x{pretty_size}"
else:
size_class = model_weight_count_rounded_notation(
abs(total_params), min_digits=2
)
return size_class
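# Illustrative usage (not part of the diff), with made-up MoE parameter counts:
#   size_label(total_params=46_700_000_000, shared_params=1_600_000_000,
#              expert_params=5_600_000_000, expert_count=8) -> "8x7.2B"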
def naming_convention(
model_name: str | None,
base_name: str | None,
finetune_string: str | None,
version_string: str | None,
size_label: str | None,
output_type: str | None,
model_type: Literal["vocab", "LoRA"] | None = None,
) -> str:
# Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention
if base_name is not None:
name = base_name.strip().replace(" ", "-").replace("/", "-")
elif model_name is not None:
name = model_name.strip().replace(" ", "-").replace("/", "-")
else:
name = "ggml-model"
parameters = f"-{size_label}" if size_label is not None else ""
finetune = (
f"-{finetune_string.strip().replace(' ', '-')}"
if finetune_string is not None
else ""
)
version = (
f"-{version_string.strip().replace(' ', '-')}"
if version_string is not None
else ""
)
encoding = (
f"-{output_type.strip().replace(' ', '-').upper()}"
if output_type is not None
else ""
)
kind = f"-{model_type.strip().replace(' ', '-')}" if model_type is not None else ""
return f"{name}{parameters}{finetune}{version}{encoding}{kind}"

View File

@ -157,36 +157,8 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
tokenizer = json.load(f) tokenizer = json.load(f)
if self.load_merges: if self.load_merges:
merges = tokenizer.get("model", {}).get("merges") merges = tokenizer.get("model", {}).get("merges")
if isinstance(merges, list) and merges: if isinstance(merges, list) and merges and isinstance(merges[0], str):
if isinstance(merges[0], str):
self.merges = merges self.merges = merges
elif (
isinstance(merges[0], list)
and len(merges[0]) == 2
and isinstance(merges[0][0], str)
):
# New format since transformers 4.45 to support spaces in merges
# ref: https://github.com/ggml-org/llama.cpp/issues/9692
# TODO: internally store as the new format instead of converting to old
if any(" " in s for pair in merges for s in pair):
logger.warning(
f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}'
)
self.merges = [
" ".join(
[
# ensure the spaces are properly encoded
"".join(
chr(ord(c) + 256) if c == " " else c
for c in part
)
for part in pair
]
)
for pair in merges
]
else:
raise ValueError("Unknown tokenizer merges format")
added_tokens = tokenizer.get("added_tokens", {}) added_tokens = tokenizer.get("added_tokens", {})
else: else:
added_tokens = {} added_tokens = {}
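# Illustrative sketch (not part of the diff) of the space encoding applied above
# when merges contain spaces:
#   encode = lambda part: "".join(chr(ord(c) + 256) if c == " " else c for c in part)
#   encode("a b") == "aĠb"   # chr(ord(" ") + 256) is "Ġ" (U+0120)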
@ -195,12 +167,7 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
return True return True
with open(tokenizer_config_file, encoding="utf-8") as f: with open(tokenizer_config_file, encoding="utf-8") as f:
tokenizer_config = json.load(f) tokenizer_config = json.load(f)
chat_template_alt = None chat_template = tokenizer_config.get("chat_template")
chat_template_file = path / "chat_template.json"
if chat_template_file.is_file():
with open(chat_template_file, encoding="utf-8") as f:
chat_template_alt = json.load(f).get("chat_template")
chat_template = tokenizer_config.get("chat_template", chat_template_alt)
if chat_template is None or isinstance(chat_template, (str, list)): if chat_template is None or isinstance(chat_template, (str, list)):
self.chat_template = chat_template self.chat_template = chat_template
else: else:
@ -257,8 +224,11 @@ class Vocab(BaseVocab, Protocol):
added_tokens_list: list[str] added_tokens_list: list[str]
fname_tokenizer: Path fname_tokenizer: Path
def __init__(self, base_path: Path): ... def __init__(self, base_path: Path):
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ... ...
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
...
class NoVocab(BaseVocab): class NoVocab(BaseVocab):

File diff suppressed because it is too large

View File

@ -1,316 +0,0 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Literal
import os
import json
def fill_templated_filename(filename: str, output_type: str | None) -> str:
# Given a file name fill in any type templates e.g. 'some-model-name.{ftype}.gguf'
ftype_lowercase: str = output_type.lower() if output_type is not None else ""
ftype_uppercase: str = output_type.upper() if output_type is not None else ""
return filename.format(
ftype_lowercase,
outtype=ftype_lowercase,
ftype=ftype_lowercase,
OUTTYPE=ftype_uppercase,
FTYPE=ftype_uppercase,
)
def model_weight_count_rounded_notation(
model_params_count: int, min_digits: int = 2
) -> str:
if model_params_count > 1e12:
# Trillions Of Parameters
scaled_model_params = model_params_count * 1e-12
scale_suffix = "T"
elif model_params_count > 1e9:
# Billions Of Parameters
scaled_model_params = model_params_count * 1e-9
scale_suffix = "B"
elif model_params_count > 1e6:
# Millions Of Parameters
scaled_model_params = model_params_count * 1e-6
scale_suffix = "M"
else:
# Thousands Of Parameters
scaled_model_params = model_params_count * 1e-3
scale_suffix = "K"
fix = max(min_digits - len(str(round(scaled_model_params)).lstrip("0")), 0)
return f"{scaled_model_params:.{fix}f}{scale_suffix}"
def size_label(
total_params: int, shared_params: int, expert_params: int, expert_count: int
) -> str:
if expert_count > 0:
pretty_size = model_weight_count_rounded_notation(
abs(shared_params) + abs(expert_params), min_digits=2
)
size_class = f"{expert_count}x{pretty_size}"
else:
size_class = model_weight_count_rounded_notation(
abs(total_params), min_digits=2
)
return size_class
def naming_convention(
model_name: str | None,
base_name: str | None,
finetune_string: str | None,
version_string: str | None,
size_label: str | None,
output_type: str | None,
model_type: Literal["vocab", "LoRA"] | None = None,
) -> str:
# Reference: https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#gguf-naming-convention
if base_name is not None:
name = base_name.strip().replace(" ", "-").replace("/", "-")
elif model_name is not None:
name = model_name.strip().replace(" ", "-").replace("/", "-")
else:
name = "ggml-model"
parameters = f"-{size_label}" if size_label is not None else ""
finetune = (
f"-{finetune_string.strip().replace(' ', '-')}"
if finetune_string is not None
else ""
)
version = (
f"-{version_string.strip().replace(' ', '-')}"
if version_string is not None
else ""
)
encoding = (
f"-{output_type.strip().replace(' ', '-').upper()}"
if output_type is not None
else ""
)
kind = f"-{model_type.strip().replace(' ', '-')}" if model_type is not None else ""
return f"{name}{parameters}{finetune}{version}{encoding}{kind}"
@dataclass
class RemoteTensor:
dtype: str
shape: tuple[int, ...]
offset_start: int
size: int
url: str
def data(self) -> bytearray:
# TODO: handle request errors (maybe with limited retries?)
# NOTE: using a bytearray, otherwise PyTorch complains the buffer is not writeable
data = bytearray(
SafetensorRemote.get_data_by_range(
url=self.url, start=self.offset_start, size=self.size
)
)
return data
class SafetensorRemote:
"""
Utility class to handle remote safetensor files.
This class is designed to work with Hugging Face model repositories.
Example (one model has single safetensor file, the other has multiple):
for model_id in ["ngxson/TEST-Tiny-Llama4", "Qwen/Qwen2.5-7B-Instruct"]:
tensors = SafetensorRemote.get_list_tensors_hf_model(model_id)
print(tensors)
Example reading tensor data:
tensors = SafetensorRemote.get_list_tensors_hf_model(model_id)
for name, meta in tensors.items():
dtype, shape, offset_start, size, remote_safetensor_url = meta
# read the tensor data
data = SafetensorRemote.get_data_by_range(remote_safetensor_url, offset_start, size)
print(data)
"""
BASE_DOMAIN = "https://huggingface.co"
ALIGNMENT = 8 # bytes
@classmethod
def get_list_tensors_hf_model(cls, model_id: str) -> dict[str, RemoteTensor]:
"""
Get list of tensors from a Hugging Face model repository.
Returns a dictionary of tensor names and their metadata.
Each tensor is represented as a RemoteTensor (dtype, shape, offset_start, size, url)
"""
# case 1: model has only one single model.safetensor file
is_single_file = cls.check_file_exist(
f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors"
)
if is_single_file:
url = f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors"
return cls.get_list_tensors(url)
# case 2: model has multiple files
index_url = (
f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors.index.json"
)
is_multiple_files = cls.check_file_exist(index_url)
if is_multiple_files:
# read the index file
index_data = cls.get_data_by_range(index_url, 0)
index_str = index_data.decode("utf-8")
index_json = json.loads(index_str)
assert (
index_json.get("weight_map") is not None
), "weight_map not found in index file"
weight_map = index_json["weight_map"]
# get the list of files
all_files = list(set(weight_map.values()))
all_files.sort() # make sure we load shard files in order
# get the list of tensors
tensors: dict[str, RemoteTensor] = {}
for file in all_files:
url = f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/{file}"
for key, val in cls.get_list_tensors(url).items():
tensors[key] = val
return tensors
raise ValueError(f"Model {model_id} does not have any safetensor files")
@classmethod
def get_list_tensors(cls, url: str) -> dict[str, RemoteTensor]:
"""
Get list of tensors from a remote safetensor file.
Returns a dictionary of tensor names and their metadata.
Each tensor is represented as a RemoteTensor (dtype, shape, offset_start, size, url)
"""
metadata, data_start_offset = cls.get_metadata(url)
res: dict[str, RemoteTensor] = {}
for name, meta in metadata.items():
if name == "__metadata__":
continue
if not isinstance(meta, dict):
raise ValueError(f"Invalid metadata for tensor '{name}': {meta}")
try:
dtype = meta["dtype"]
shape = meta["shape"]
offset_start_relative, offset_end_relative = meta["data_offsets"]
size = offset_end_relative - offset_start_relative
offset_start = data_start_offset + offset_start_relative
res[name] = RemoteTensor(
dtype=dtype,
shape=tuple(shape),
offset_start=offset_start,
size=size,
url=url,
)
except KeyError as e:
raise ValueError(
f"Missing key in metadata for tensor '{name}': {e}, meta = {meta}"
)
return res
@classmethod
def get_metadata(cls, url: str) -> tuple[dict, int]:
"""
Get JSON metadata from a remote safetensor file.
Returns tuple of (metadata, data_start_offset)
"""
# Request first 5MB of the file (hopefully enough for metadata)
read_size = 5 * 1024 * 1024
raw_data = cls.get_data_by_range(url, 0, read_size)
# Parse header
# First 8 bytes contain the metadata length as u64 little-endian
if len(raw_data) < 8:
raise ValueError("Not enough data to read metadata size")
metadata_length = int.from_bytes(raw_data[:8], byteorder="little")
# Calculate the data start offset
data_start_offset = 8 + metadata_length
alignment = SafetensorRemote.ALIGNMENT
if data_start_offset % alignment != 0:
data_start_offset += alignment - (data_start_offset % alignment)
# Check if we have enough data to read the metadata
if len(raw_data) < 8 + metadata_length:
raise ValueError(
f"Could not read complete metadata. Need {8 + metadata_length} bytes, got {len(raw_data)}"
)
# Extract metadata bytes and parse as JSON
metadata_bytes = raw_data[8 : 8 + metadata_length]
metadata_str = metadata_bytes.decode("utf-8")
try:
metadata = json.loads(metadata_str)
return metadata, data_start_offset
except json.JSONDecodeError as e:
raise ValueError(f"Failed to parse safetensor metadata as JSON: {e}")
@classmethod
def get_data_by_range(cls, url: str, start: int, size: int = -1) -> bytes:
"""
Get raw byte data from a remote file by range.
If size is not specified, it will read the entire file.
"""
import requests
from urllib.parse import urlparse
parsed_url = urlparse(url)
if not parsed_url.scheme or not parsed_url.netloc:
raise ValueError(f"Invalid URL: {url}")
headers = cls._get_request_headers()
if size > -1:
headers["Range"] = f"bytes={start}-{start + size}"
response = requests.get(url, allow_redirects=True, headers=headers)
response.raise_for_status()
# Get raw byte data
return response.content[:size]
@classmethod
def check_file_exist(cls, url: str) -> bool:
"""
Check if a file exists at the given URL.
Returns True if the file exists, False otherwise.
"""
import requests
from urllib.parse import urlparse
parsed_url = urlparse(url)
if not parsed_url.scheme or not parsed_url.netloc:
raise ValueError(f"Invalid URL: {url}")
try:
headers = cls._get_request_headers()
headers["Range"] = "bytes=0-0"
response = requests.head(url, allow_redirects=True, headers=headers)
# Success (2xx) or redirect (3xx)
return 200 <= response.status_code < 400
except requests.RequestException:
return False
@classmethod
def _get_request_headers(cls) -> dict[str, str]:
"""Prepare common headers for requests."""
headers = {"User-Agent": "convert_hf_to_gguf"}
if os.environ.get("HF_TOKEN"):
headers["Authorization"] = f"Bearer {os.environ['HF_TOKEN']}"
return headers

View File

@ -1,123 +0,0 @@
import os
import re
import sys
from typing import Any, IO, List, TextIO, Union
from PySide6.QtWidgets import (
QMessageBox,
)
from Localizations import (
DOTENV_FILE_NOT_FOUND,
COULD_NOT_PARSE_LINE,
ERROR_LOADING_DOTENV,
AUTOGGUF_VERSION,
)
def verify_gguf(file_path) -> bool:
try:
with open(file_path, "rb") as f:
magic = f.read(4)
return magic == b"GGUF"
except (FileNotFoundError, IOError, OSError):
return False
def process_args(args: List[str]) -> bool:
try:
i = 1
while i < len(args):
key = (
args[i][2:].replace("-", "_").upper()
) # Strip the leading '--' and replace '-' with '_'
if i + 1 < len(args) and not args[i + 1].startswith("--"):
value = args[i + 1]
i += 2
else:
value = "enabled"
i += 1
os.environ[key] = value
return True
except Exception:
return False
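# Illustrative usage (not part of the diff); the flags here are made up:
#   process_args(["app.py", "--server-port", "7001", "--verbose"])
#   -> os.environ["SERVER_PORT"] == "7001" and os.environ["VERBOSE"] == "enabled"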
def load_dotenv(self=Any) -> None:
if not os.path.isfile(".env"):
self.logger.warning(DOTENV_FILE_NOT_FOUND)
return
try:
with open(".env") as f:
for line in f:
# Strip leading/trailing whitespace
line = line.strip()
# Ignore comments and empty lines
if not line or line.startswith("#"):
continue
# Match key-value pairs (unquoted and quoted values)
match = re.match(r"^([^=]+)=(.*)$", line)
if not match:
self.logger.warning(COULD_NOT_PARSE_LINE.format(line))
continue
key, value = match.groups()
# Remove any surrounding quotes from the value
if value.startswith(("'", '"')) and value.endswith(("'", '"')):
value = value[1:-1]
# Decode escape sequences
value = bytes(value, "utf-8").decode("unicode_escape")
# Set the environment variable
os.environ[key.strip()] = value.strip()
except Exception as e:
self.logger.error(ERROR_LOADING_DOTENV.format(e))
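# Minimal sketch (not part of the diff) of the parsing rule above, with a made-up key;
# surrounding quotes are stripped and escape sequences decoded before the value is set:
#   line = 'MY_SETTING="hello\\nworld"'
#   key, value = re.match(r"^([^=]+)=(.*)$", line).groups()
#   value = bytes(value[1:-1], "utf-8").decode("unicode_escape")   # -> "hello\nworld"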
def show_about(self) -> None:
about_text = f"""AutoGGUF
Version: {AUTOGGUF_VERSION}
A tool for managing and converting GGUF models.
This application is licensed under the Apache License 2.0.
Copyright (c) 2024-2025 leafspark.
It also utilizes llama.cpp, licensed under the MIT License.
Copyright (c) 2023-2025 The ggml authors."""
QMessageBox.about(self, "About AutoGGUF", about_text)
def ensure_directory(path) -> None:
if not os.path.exists(path):
os.makedirs(path)
def open_file_safe(file_path, mode="r") -> IO[Any]:
encodings = ["utf-8", "latin-1", "ascii", "utf-16"]
for encoding in encodings:
try:
return open(file_path, mode, encoding=encoding)
except UnicodeDecodeError:
continue
raise ValueError(
f"Unable to open file {file_path} with any of the encodings: {encodings}"
)
def resource_path(relative_path) -> Union[str, str, bytes]:
if hasattr(sys, "_MEIPASS"):
# PyInstaller path
base_path = sys._MEIPASS
elif "__compiled__" in globals():
# Nuitka path
base_path = os.path.dirname(sys.executable)
else:
# Regular Python path
base_path = os.path.abspath(".")
return os.path.join(base_path, relative_path)

View File

@ -0,0 +1,81 @@
import os
import sys
import psutil
import subprocess
import time
import signal
import json
import platform
import requests
import zipfile
from datetime import datetime
from PySide6.QtWidgets import (
QApplication,
QMainWindow,
QVBoxLayout,
QHBoxLayout,
QWidget,
QPushButton,
QListWidget,
QLineEdit,
QLabel,
QFileDialog,
QProgressBar,
QComboBox,
QTextEdit,
QCheckBox,
QGroupBox,
QFormLayout,
QScrollArea,
QSlider,
QSpinBox,
QListWidgetItem,
QMessageBox,
QDialog,
QPlainTextEdit,
QMenu,
)
from PySide6.QtCore import QTimer, Signal, QThread, Qt, QSize
from PySide6.QtGui import QCloseEvent, QAction
from Localizations import *
def show_about(self):
about_text = (
"AutoGGUF\n\n"
f"Version: {AUTOGGUF_VERSION}\n\n"
"A tool for managing and converting GGUF models."
)
QMessageBox.about(self, "About AutoGGUF", about_text)
def ensure_directory(path):
if not os.path.exists(path):
os.makedirs(path)
def open_file_safe(file_path, mode="r"):
encodings = ["utf-8", "latin-1", "ascii", "utf-16"]
for encoding in encodings:
try:
return open(file_path, mode, encoding=encoding)
except UnicodeDecodeError:
continue
raise ValueError(
f"Unable to open file {file_path} with any of the encodings: {encodings}"
)
def resource_path(relative_path):
if hasattr(sys, "_MEIPASS"):
# PyInstaller path
base_path = sys._MEIPASS
elif "__compiled__" in globals():
# Nuitka path
base_path = os.path.dirname(sys.executable)
else:
# Regular Python path
base_path = os.path.abspath(".")
return os.path.join(base_path, relative_path)

View File

@ -1,166 +1,15 @@
from datetime import datetime from datetime import datetime
from PySide6.QtWidgets import ( from PySide6.QtWidgets import QListWidgetItem
QFileDialog,
QHBoxLayout,
QLineEdit,
QListWidgetItem,
QPushButton,
QWidget,
)
from QuantizationThread import QuantizationThread from QuantizationThread import QuantizationThread
from TaskListItem import TaskListItem from TaskListItem import TaskListItem
from error_handling import handle_error, show_error from error_handling import handle_error, show_error
from globals import ensure_directory from imports_and_globals import ensure_directory
from Localizations import * from Localizations import *
def export_lora(self) -> None: def convert_lora(self):
self.logger.info(STARTING_LORA_EXPORT)
try:
model_path = self.export_lora_model.text()
output_path = self.export_lora_output.text()
lora_adapters = []
for i in range(self.export_lora_adapters.count()):
item = self.export_lora_adapters.item(i)
adapter_widget = self.export_lora_adapters.itemWidget(item)
path_input = adapter_widget.layout().itemAt(0).widget()
scale_input = adapter_widget.layout().itemAt(1).widget()
adapter_path = path_input.text()
adapter_scale = scale_input.text()
lora_adapters.append((adapter_path, adapter_scale))
if not model_path:
raise ValueError(MODEL_PATH_REQUIRED)
if not output_path:
raise ValueError(OUTPUT_PATH_REQUIRED)
if not lora_adapters:
raise ValueError(AT_LEAST_ONE_LORA_ADAPTER_REQUIRED)
backend_path = self.backend_combo.currentData()
if not backend_path:
raise ValueError(NO_BACKEND_SELECTED)
command = [
os.path.join(backend_path, "llama-export-lora"),
"--model",
model_path,
"--output",
output_path,
]
for adapter_path, adapter_scale in lora_adapters:
if adapter_path:
if adapter_scale:
try:
scale_value = float(adapter_scale)
command.extend(
["--lora-scaled", adapter_path, str(scale_value)]
)
except ValueError:
raise ValueError(INVALID_LORA_SCALE_VALUE)
else:
command.extend(["--lora", adapter_path])
threads = self.export_lora_threads.value()
command.extend(["--threads", str(threads)])
logs_path = self.logs_input.text()
ensure_directory(logs_path)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = os.path.join(logs_path, f"lora_export_{timestamp}.log")
command_str = " ".join(command)
self.logger.info(f"{LORA_EXPORT_COMMAND}: {command_str}")
thread = QuantizationThread(command, backend_path, log_file)
self.quant_threads.append(thread)
task_item = TaskListItem(EXPORTING_LORA, log_file, show_progress_bar=False)
list_item = QListWidgetItem(self.task_list)
list_item.setSizeHint(task_item.sizeHint())
self.task_list.addItem(list_item)
self.task_list.setItemWidget(list_item, task_item)
thread.status_signal.connect(task_item.update_status)
thread.finished_signal.connect(lambda: self.task_finished(thread))
thread.error_signal.connect(
lambda err: handle_error(self.logger, err, task_item)
)
thread.start()
self.logger.info(LORA_EXPORT_TASK_STARTED)
except ValueError as e:
show_error(self.logger, str(e))
except Exception as e:
show_error(self.logger, ERROR_STARTING_LORA_EXPORT.format(str(e)))
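# Illustrative example (not part of the diff): for a single adapter at scale "1.0"
# and 8 threads, the command assembled above looks like (paths are placeholders):
#   [os.path.join(backend_path, "llama-export-lora"),
#    "--model", "base.gguf", "--output", "merged.gguf",
#    "--lora-scaled", "adapter.gguf", "1.0", "--threads", "8"]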
def lora_conversion_finished(self, thread) -> None:
self.logger.info(LORA_CONVERSION_FINISHED)
if thread in self.quant_threads:
self.quant_threads.remove(thread)
def delete_lora_adapter_item(self, adapter_widget) -> None:
self.logger.info(DELETING_LORA_ADAPTER)
# Find the QListWidgetItem containing the adapter_widget
for i in range(self.export_lora_adapters.count()):
item = self.export_lora_adapters.item(i)
if self.export_lora_adapters.itemWidget(item) == adapter_widget:
self.export_lora_adapters.takeItem(i) # Remove the item
break
def browse_export_lora_model(self) -> None:
self.logger.info(BROWSING_FOR_EXPORT_LORA_MODEL_FILE)
model_file, _ = QFileDialog.getOpenFileName(self, SELECT_MODEL_FILE, "", GGUF_FILES)
if model_file:
self.export_lora_model.setText(os.path.abspath(model_file))
def browse_export_lora_output(self) -> None:
self.logger.info(BROWSING_FOR_EXPORT_LORA_OUTPUT_FILE)
output_file, _ = QFileDialog.getSaveFileName(
self, SELECT_OUTPUT_FILE, "", GGUF_FILES
)
if output_file:
self.export_lora_output.setText(os.path.abspath(output_file))
def add_lora_adapter(self) -> None:
self.logger.info(ADDING_LORA_ADAPTER)
adapter_path, _ = QFileDialog.getOpenFileName(
self, SELECT_LORA_ADAPTER_FILE, "", LORA_FILES
)
if adapter_path:
# Create a widget to hold the path and scale input
adapter_widget = QWidget()
adapter_layout = QHBoxLayout(adapter_widget)
path_input = QLineEdit(adapter_path)
path_input.setReadOnly(True)
adapter_layout.addWidget(path_input)
scale_input = QLineEdit("1.0") # Default scale value
adapter_layout.addWidget(scale_input)
delete_button = QPushButton(DELETE_ADAPTER)
delete_button.clicked.connect(
lambda: self.delete_lora_adapter_item(adapter_widget)
)
adapter_layout.addWidget(delete_button)
# Add the widget to the list
list_item = QListWidgetItem(self.export_lora_adapters)
list_item.setSizeHint(adapter_widget.sizeHint())
self.export_lora_adapters.addItem(list_item)
self.export_lora_adapters.setItemWidget(list_item, adapter_widget)
def convert_lora(self) -> None:
self.logger.info(STARTING_LORA_CONVERSION) self.logger.info(STARTING_LORA_CONVERSION)
try: try:
lora_input_path = self.lora_input.text() lora_input_path = self.lora_input.text()
@ -185,12 +34,7 @@ def convert_lora(self) -> None:
raise ValueError(BASE_MODEL_PATH_REQUIRED) raise ValueError(BASE_MODEL_PATH_REQUIRED)
command.extend(["--base", base_model_path]) command.extend(["--base", base_model_path])
else: # Use old GGML parameters for GGML else: # Use old GGML parameters for GGML
command = [ command = ["python", "src/convert_lora_to_ggml.py", lora_input_path]
"python",
"src/convert_lora_to_ggml.py",
lora_input_path,
lora_output_path,
]
logs_path = self.logs_input.text() logs_path = self.logs_input.text()
ensure_directory(logs_path) ensure_directory(logs_path)
@ -214,7 +58,11 @@ def convert_lora(self) -> None:
self.task_list.setItemWidget(list_item, task_item) self.task_list.setItemWidget(list_item, task_item)
thread.status_signal.connect(task_item.update_status) thread.status_signal.connect(task_item.update_status)
thread.finished_signal.connect(lambda: self.lora_conversion_finished(thread)) thread.finished_signal.connect(
lambda: self.lora_conversion_finished(
thread, lora_input_path, lora_output_path
)
)
thread.error_signal.connect( thread.error_signal.connect(
lambda err: handle_error(self.logger, err, task_item) lambda err: handle_error(self.logger, err, task_item)
) )

View File

@ -1,196 +1,61 @@
import os import os
import sys import sys
import threading import threading
from enum import Enum
from typing import List, Optional
from PySide6.QtCore import QTimer from PySide6.QtCore import QTimer
from PySide6.QtWidgets import QApplication from PySide6.QtWidgets import QApplication
from fastapi import FastAPI, Query, Depends, HTTPException, Security
from fastapi.security.api_key import APIKeyHeader
from pydantic import BaseModel, Field
from uvicorn import Config, Server
from AutoGGUF import AutoGGUF from AutoGGUF import AutoGGUF
from Localizations import AUTOGGUF_VERSION from flask import Flask, jsonify
app = FastAPI( server = Flask(__name__)
title="AutoGGUF",
description="API for AutoGGUF - automatically quant GGUF models",
version=AUTOGGUF_VERSION,
license_info={
"name": "Apache 2.0",
"url": "https://raw.githubusercontent.com/leafspark/AutoGGUF/main/LICENSE",
},
)
# Global variable to hold the window reference
window = None
class ModelType(str, Enum): @server.route("/v1/models", methods=["GET"])
single = "single" def models():
sharded = "sharded"
class Model(BaseModel):
name: str = Field(..., description="Name of the model")
type: str = Field(..., description="Type of the model")
path: str = Field(..., description="Path to the model file")
size: Optional[int] = Field(None, description="Size of the model in bytes")
class Config:
json_schema_extra = {
"example": {
"name": "Llama-3.1-8B-Instruct.fp16.gguf",
"type": "single",
"path": "Llama-3.1-8B-Instruct.fp16.gguf",
"size": 13000000000,
}
}
class Task(BaseModel):
# id: str = Field(..., description="Unique identifier for the task")
status: str = Field(..., description="Current status of the task")
progress: float = Field(..., description="Progress of the task as a percentage")
class Config:
json_schema_extra = {
"example": {"id": "task_123", "status": "running", "progress": 75.5}
}
class Backend(BaseModel):
name: str = Field(..., description="Name of the backend")
path: str = Field(..., description="Path to the backend executable")
class Plugin(BaseModel):
name: str = Field(..., description="Name of the plugin")
version: str = Field(..., description="Version of the plugin")
description: str = Field(..., description="Description of the plugin")
author: str = Field(..., description="Author of the plugin")
# API Key configuration
API_KEY_NAME = "Authorization"
api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False)
def get_api_key(
api_key_header: str = Security(api_key_header),
) -> Optional[str]:
api_key_env = os.getenv("AUTOGGUF_SERVER_API_KEY")
if not api_key_env:
return None # No API key restriction if not set
api_keys = [
key.strip() for key in api_key_env.split(",") if key.strip()
] # Split by comma and strip whitespace
if api_key_header and api_key_header.startswith("Bearer "):
api_key = api_key_header[len("Bearer ") :]
if api_key in api_keys:
return api_key
raise HTTPException(status_code=403, detail="Could not validate API key")
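# Hypothetical client sketch (not part of the diff); the key and port are examples
# and must match AUTOGGUF_SERVER_API_KEY and AUTOGGUF_SERVER_PORT:
#   import requests
#   resp = requests.get(
#       "http://127.0.0.1:7001/v1/models",
#       headers={"Authorization": "Bearer my-secret-key"},
#   )
#   print(resp.json())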
@app.get(
"/v1/models",
response_model=List[Model],
tags=["Models"],
dependencies=[Depends(get_api_key)],
)
async def get_models(
type: Optional[ModelType] = Query(None, description="Filter models by type")
) -> List[Model]:
if window: if window:
models = window.get_models_data() return jsonify({"models": window.get_models_data()})
if type: return jsonify({"models": []})
models = [m for m in models if m["type"] == type]
return [Model(**m) for m in models]
return []
@app.get( @server.route("/v1/tasks", methods=["GET"])
"/v1/tasks", def tasks():
response_model=List[Task],
tags=["Tasks"],
dependencies=[Depends(get_api_key)],
)
async def get_tasks() -> List[Task]:
if window: if window:
return window.get_tasks_data() return jsonify({"tasks": window.get_tasks_data()})
return [] return jsonify({"tasks": []})
@app.get("/v1/health", tags=["System"], dependencies=[Depends(get_api_key)]) @server.route("/v1/health", methods=["GET"])
async def health_check() -> dict: def ping():
return {"status": "alive"} return jsonify({"status": "alive"})
@app.get( @server.route("/v1/backends", methods=["GET"])
"/v1/backends", def get_backends():
response_model=List[Backend],
tags=["System"],
dependencies=[Depends(get_api_key)],
)
async def get_backends() -> List[Backend]:
backends = [] backends = []
if window:
for i in range(window.backend_combo.count()): for i in range(window.backend_combo.count()):
backends.append( backends.append(
Backend( {
name=window.backend_combo.itemText(i), "name": window.backend_combo.itemText(i),
path=window.backend_combo.itemData(i), "path": window.backend_combo.itemData(i),
}
) )
) return jsonify({"backends": backends})
return backends
@app.get( def run_flask():
"/v1/plugins", if os.environ.get("AUTOGGUF_SERVER", "").lower() == "true":
response_model=List[Plugin], server.run(
tags=["System"], host="0.0.0.0",
dependencies=[Depends(get_api_key)], port=int(os.environ.get("AUTOGGUF_SERVER_PORT", 5000)),
) debug=False,
async def get_plugins() -> List[Plugin]: use_reloader=False,
if window:
return [
Plugin(**plugin_data["data"]) for plugin_data in window.plugins.values()
]
return []
def run_uvicorn() -> None:
if os.environ.get("AUTOGGUF_SERVER", "").lower() == "enabled":
config = Config(
app=app,
host="127.0.0.1",
port=int(os.environ.get("AUTOGGUF_SERVER_PORT", 7001)),
log_level="info",
)
server = Server(config)
server.run()
def main() -> None:
global window
qt_app = QApplication(sys.argv)
window = AutoGGUF(sys.argv)
window.show()
# Start Uvicorn in a separate thread after a short delay
timer = QTimer()
timer.singleShot(
100, lambda: threading.Thread(target=run_uvicorn, daemon=True).start()
) )
sys.exit(qt_app.exec())
app = QApplication(sys.argv)
if __name__ == "__main__": window = AutoGGUF()
main() window.show()
# Start Flask in a separate thread after a short delay
timer = QTimer()
timer.singleShot(100, lambda: threading.Thread(target=run_flask, daemon=True).start())
sys.exit(app.exec())

View File

@ -1,118 +0,0 @@
import json
from PySide6.QtCore import Qt
from PySide6.QtWidgets import QApplication, QFileDialog, QMessageBox
from Localizations import (
SAVING_PRESET,
SAVE_PRESET,
JSON_FILES,
PRESET_SAVED,
PRESET_SAVED_TO,
LOADING_PRESET,
LOAD_PRESET,
PRESET_LOADED,
PRESET_LOADED_FROM,
)
def save_preset(self) -> None:
self.logger.info(SAVING_PRESET)
preset = {
"quant_types": [item.text() for item in self.quant_type.selectedItems()],
"allow_requantize": self.allow_requantize.isChecked(),
"leave_output_tensor": self.leave_output_tensor.isChecked(),
"pure": self.pure.isChecked(),
"imatrix": self.imatrix.text(),
"include_weights": self.include_weights.text(),
"exclude_weights": self.exclude_weights.text(),
"use_output_tensor_type": self.use_output_tensor_type.isChecked(),
"output_tensor_type": self.output_tensor_type.currentText(),
"use_token_embedding_type": self.use_token_embedding_type.isChecked(),
"token_embedding_type": self.token_embedding_type.currentText(),
"keep_split": self.keep_split.isChecked(),
"kv_overrides": [
entry.get_raw_override_string() for entry in self.kv_override_entries
],
"extra_arguments": self.extra_arguments.text(),
}
if not QApplication.keyboardModifiers() & Qt.ShiftModifier:
file_name, _ = QFileDialog.getSaveFileName(self, SAVE_PRESET, "", JSON_FILES)
if file_name:
with open(file_name, "w") as f:
json.dump(preset, f, indent=4)
QMessageBox.information(
self, PRESET_SAVED, PRESET_SAVED_TO.format(file_name)
)
self.logger.info(PRESET_SAVED_TO.format(file_name))
else:
clipboard = QApplication.clipboard()
preset_str = json.dumps(preset, indent=1)
clipboard.setText(preset_str)
QMessageBox.information(self, PRESET_SAVED, "Preset copied to clipboard")
self.logger.info("Preset copied to clipboard")
def load_preset(self) -> None:
self.logger.info(LOADING_PRESET)
try:
if QApplication.keyboardModifiers() & Qt.ShiftModifier:
clipboard = QApplication.clipboard()
preset = json.loads(clipboard.text())
source = "clipboard"
else:
file_name, _ = QFileDialog.getOpenFileName(
self, LOAD_PRESET, "", JSON_FILES
)
if not file_name:
return
with open(file_name, "r") as f:
preset = json.load(f)
source = file_name
self.quant_type.clearSelection()
for quant_type in preset.get("quant_types", []):
items = self.quant_type.findItems(quant_type, Qt.MatchExactly)
if items:
items[0].setSelected(True)
self.allow_requantize.setChecked(preset.get("allow_requantize", False))
self.leave_output_tensor.setChecked(preset.get("leave_output_tensor", False))
self.pure.setChecked(preset.get("pure", False))
self.imatrix.setText(preset.get("imatrix", ""))
self.include_weights.setText(preset.get("include_weights", ""))
self.exclude_weights.setText(preset.get("exclude_weights", ""))
self.use_output_tensor_type.setChecked(
preset.get("use_output_tensor_type", False)
)
self.output_tensor_type.setCurrentText(preset.get("output_tensor_type", ""))
self.use_token_embedding_type.setChecked(
preset.get("use_token_embedding_type", False)
)
self.token_embedding_type.setCurrentText(preset.get("token_embedding_type", ""))
self.keep_split.setChecked(preset.get("keep_split", False))
self.extra_arguments.setText(preset.get("extra_arguments", ""))
# Clear existing KV overrides and add new ones
for entry in self.kv_override_entries:
self.remove_kv_override(entry)
for override in preset.get("kv_overrides", []):
self.add_kv_override(override)
QMessageBox.information(
self,
PRESET_LOADED,
PRESET_LOADED_FROM.format(
source
if not QApplication.keyboardModifiers() & Qt.ShiftModifier
else "clipboard"
),
)
self.logger.info(PRESET_LOADED_FROM.format(source))
except json.JSONDecodeError:
QMessageBox.critical(self, "Error", "Invalid JSON in clipboard")
self.logger.error("Failed to parse JSON from clipboard")
except Exception as e:
QMessageBox.critical(self, "Error", f"Failed to load preset: {str(e)}")
self.logger.error(f"Failed to load preset: {str(e)}")

View File

@ -1,559 +0,0 @@
import copy
import gc
import re
import sys
from typing import List
from typing import Optional, Tuple
import torch
import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
# https://github.com/neuralmagic/AutoFP8
class BaseQuantizeConfig:
"""Configuration for model quantization.
Args:
quant_method: Type/precision of quantization method to use.
At the moment, this is just "fp8" which specifically means
the fp8_e4m3 format in pytorch.
activation_scheme: Choice of either "dynamic" or "static" quantization
of activations. If "static", then calibration samples are required
during quantization to produce accurate per-tensor scales for
activations of Linear modules.
ignore_patterns: List of patterns used to ignore layers. If a string
starts with "re:", then everything afterward is used as python
regex style matching i.e. re.search(), for each Linear layer.
By default, "re:.*lm_head" is included to ignore the embedding
Linear layer usually at the end of decoder LLMs
kv_cache_quant_targets: Tuple of Linear module names to target for
calibration of the output scales for KV cache quantization.
Usually, these should be `("k_proj", "v_proj")`.
"""
def __init__(
self,
quant_method: str = "fp8",
activation_scheme: str = "static",
ignore_patterns: List[str] = ["re:.*lm_head"],
kv_cache_quant_targets: Optional[Tuple[str]] = None,
):
if quant_method != "fp8":
raise ValueError("Only FP8 quantization is supported.")
if activation_scheme not in ["static", "dynamic"]:
raise ValueError(
"Invalid activation_scheme. Choose either 'static' or 'dynamic'."
)
self.quant_method = quant_method
self.activation_scheme = activation_scheme
self.ignore_patterns = ignore_patterns
self.kv_cache_quant_targets = kv_cache_quant_targets
self.ignored_layers = []
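# Illustrative usage (not part of the original module): a config for static
# FP8 quantization that also calibrates KV-cache scales could be built as
#
#     config = BaseQuantizeConfig(
#         quant_method="fp8",
#         activation_scheme="static",
#         ignore_patterns=["re:.*lm_head"],
#         kv_cache_quant_targets=("k_proj", "v_proj"),
#     )
#
# mirroring the defaults and the ("k_proj", "v_proj") hint in the docstring.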
# Class responsible for quantizing weights
class FP8DynamicLinear(torch.nn.Module):
def __init__(
self,
weight: torch.Tensor,
weight_scale: torch.Tensor,
bias: torch.nn.Parameter,
):
super().__init__()
self.weight = torch.nn.Parameter(weight, requires_grad=False)
self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
self.bias = bias
def forward(self, x):
qinput, x_scale = per_tensor_quantize(x)
output = fp8_gemm(
A=qinput,
A_scale=x_scale,
B=self.weight,
B_scale=self.weight_scale,
bias=self.bias,
out_dtype=x.dtype,
)
return output
# Module responsible for taking already quantized weights, and recording input scales (and possibly output scales)
# using an activation observer
class FP8StaticLinearQuantizer(torch.nn.Module):
def __init__(
self,
weight: torch.Tensor,
weight_scale: torch.Tensor,
bias: torch.nn.Parameter,
quantize_output: bool = False,
):
super().__init__()
self.weight = torch.nn.Parameter(weight, requires_grad=False)
self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
self.bias = bias
self.input_scale = None
self.output_scale = None
self.quantize_output = quantize_output
def forward(self, x):
qinput, x_input_scale = per_tensor_quantize(x)
if self.input_scale is None:
self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False)
elif x_input_scale > self.input_scale:
self.input_scale = torch.nn.Parameter(x_input_scale, requires_grad=False)
output = fp8_gemm(
A=qinput,
A_scale=self.input_scale,
B=self.weight,
B_scale=self.weight_scale,
bias=self.bias,
out_dtype=x.dtype,
)
# Optionally, quantize output and record scale
if self.quantize_output:
qoutput, output_scale = per_tensor_quantize(output)
if self.output_scale is None:
self.output_scale = torch.nn.Parameter(
output_scale, requires_grad=False
)
elif output_scale > self.output_scale:
self.output_scale = torch.nn.Parameter(
output_scale, requires_grad=False
)
output = qoutput.to(output.dtype) * output_scale
return output
# Module responsible for representing the final checkpoint representation
class FP8StaticLinear(torch.nn.Module):
def __init__(
self,
weight: torch.nn.Parameter,
weight_scale: torch.nn.Parameter,
bias: torch.nn.Parameter,
input_scale: torch.nn.Parameter,
output_scale: Optional[torch.nn.Parameter] = None,
):
super().__init__()
self.weight = weight
self.weight_scale = weight_scale
self.bias = bias
self.input_scale = input_scale
self.output_scale = output_scale
def forward(self, x):
qinput = static_per_tensor_quantize(x, self.input_scale)
output = fp8_gemm(
A=qinput,
A_scale=self.input_scale,
B=self.weight,
B_scale=self.weight_scale,
bias=self.bias,
out_dtype=x.dtype,
)
if self.output_scale:
qoutput = static_per_tensor_quantize(output, self.output_scale)
output = qoutput.to(output.dtype) * self.output_scale
return output
class AutoFP8ForCausalLM:
def __init__(
self,
model: AutoModelForCausalLM,
quantize_config: BaseQuantizeConfig,
):
self.model = model
self.model_type = self.model.config.model_type
self.config = self.model.config
# Gather the Linear module names that we want to ignore
quantize_config.ignored_layers = get_layers_to_ignore(
self.model, quantize_config.ignore_patterns
)
if quantize_config.kv_cache_quant_targets:
kv_cache_quant_layers = get_kv_cache_quant_layers(
self.model, quantize_config.kv_cache_quant_targets
)
if len(kv_cache_quant_layers) == 0:
raise ValueError(
f"Could not find any kv cache layers using kv_cache_quant_targets={quantize_config.kv_cache_quant_targets}, please fix your argument."
)
quantize_config.kv_cache_quant_layers = kv_cache_quant_layers
self.quantize_config = quantize_config
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path: str,
quantize_config: BaseQuantizeConfig,
**model_init_kwargs,
):
"""Load the un-quantized pretrained model"""
def skip(*args, **kwargs):
pass
torch.nn.init.kaiming_uniform_ = skip
torch.nn.init.uniform_ = skip
torch.nn.init.normal_ = skip
# Parameters related to loading from Hugging Face Hub
cache_dir = model_init_kwargs.pop("cache_dir", None)
force_download = model_init_kwargs.pop("force_download", False)
resume_download = model_init_kwargs.pop("resume_download", False)
proxies = model_init_kwargs.pop("proxies", None)
local_files_only = model_init_kwargs.pop("local_files_only", False)
use_auth_token = model_init_kwargs.pop("use_auth_token", None)
revision = model_init_kwargs.pop("revision", None)
subfolder = model_init_kwargs.pop("subfolder", "")
commit_hash = model_init_kwargs.pop("_commit_hash", None)
cached_file_kwargs = {
"cache_dir": cache_dir,
"force_download": force_download,
"proxies": proxies,
"resume_download": resume_download,
"local_files_only": local_files_only,
"use_auth_token": use_auth_token,
"revision": revision,
"subfolder": subfolder,
"_commit_hash": commit_hash,
}
torch.cuda.empty_cache()
# Important defaults
if "torch_dtype" not in model_init_kwargs:
model_init_kwargs["torch_dtype"] = "auto"
if "device_map" not in model_init_kwargs:
model_init_kwargs["device_map"] = "auto"
merged_kwargs = {**model_init_kwargs, **cached_file_kwargs}
print("Loading model with the following kwargs:", merged_kwargs)
model = AutoModelForCausalLM.from_pretrained(
pretrained_model_name_or_path, **merged_kwargs
)
model_config = model.config.to_dict()
seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions"]
if any(k in model_config for k in seq_len_keys):
for key in seq_len_keys:
if key in model_config:
model.seqlen = model_config[key]
break
else:
print("Can't get model's sequence length, setting to 2048.")
model.seqlen = 2048
model.eval()
return cls(model, quantize_config)
def quantize(self, calibration_tokens: Optional[torch.Tensor] = None):
# Always quantize the weights as they do not require calibration data
quantize_weights(self.model, self.quantize_config)
if self.quantize_config.activation_scheme == "static":
assert (
calibration_tokens is not None
), "Calibration tokens required for activation quantization"
def _prepare_calibration_data(calibration_tokens):
if hasattr(calibration_tokens, "input_ids"):
return calibration_tokens.input_ids
return calibration_tokens
quantize_activations(
self.model,
self.quantize_config,
_prepare_calibration_data(calibration_tokens),
)
def save_quantized(self, save_dir):
save_quantized_model(
self.model,
quant_config=self.quantize_config,
save_dir=save_dir,
)
def cleanup_memory():
gc.collect()
torch.cuda.empty_cache()
def per_tensor_quantize(tensor: torch.Tensor) -> Tuple[torch.Tensor, float]:
"""Quantize a tensor using per-tensor static scaling factor.
Args:
tensor: The input tensor.
"""
finfo = torch.finfo(torch.float8_e4m3fn)
# Calculate the scale as dtype max divided by absmax.
# Since .abs() creates a new tensor, we use aminmax to get
# the min and max first and then calculate the absmax.
if tensor.numel() == 0:
# Deal with empty tensors (triggered by empty MoE experts)
min_val, max_val = (
torch.tensor(-16.0, dtype=tensor.dtype),
torch.tensor(16.0, dtype=tensor.dtype),
)
else:
min_val, max_val = tensor.aminmax()
amax = torch.maximum(min_val.abs(), max_val.abs())
scale = finfo.max / amax.clamp(min=1e-12)
# Scale and clamp the tensor to bring it to
# the representative range of float8 data type
# (as default cast is unsaturated)
qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max)
# Return both the float8 data and the inverse scale (as float),
# since both are required as inputs to torch._scaled_mm
qweight = qweight.to(torch.float8_e4m3fn)
scale = scale.float().reciprocal()
return qweight, scale
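# Worked example (illustrative): float8_e4m3fn has finfo.max == 448.0, so a
# tensor whose absmax is 896.0 gets scale = 448.0 / 896.0 = 0.5; values are
# halved before the cast, and the returned inverse scale of 2.0 lets callers
# such as fp8_gemm recover the original magnitude.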
def static_per_tensor_quantize(tensor: torch.Tensor, inv_scale: float) -> torch.Tensor:
finfo = torch.finfo(torch.float8_e4m3fn)
qweight = (tensor / inv_scale).clamp(min=finfo.min, max=finfo.max)
return qweight.to(torch.float8_e4m3fn)
def fp8_gemm(A, A_scale, B, B_scale, bias, out_dtype):
if A.numel() == 0:
# Deal with empty tensors (triggered by empty MoE experts)
return torch.empty(size=(0, B.shape[0]), dtype=out_dtype, device=A.device)
# TODO: Disable native fp8 gemm for now, always just dequantize
# native_fp8_support = (
# torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 9)
# )
native_fp8_support = False
if native_fp8_support:
need_reshape = A.dim() == 3
if need_reshape:
batch_size = A.shape[0]
A_input = A.reshape(-1, A.shape[-1])
else:
batch_size = None
A_input = A
output, _ = torch._scaled_mm(
A_input,
B.t(),
out_dtype=out_dtype,
scale_a=A_scale,
scale_b=B_scale,
bias=bias,
)
if need_reshape:
output = output.reshape(
batch_size, output.shape[0] // batch_size, output.shape[1]
)
else:
output = torch.nn.functional.linear(
A.to(out_dtype) * A_scale,
B.to(out_dtype) * B_scale.to(out_dtype),
bias=bias,
)
return output
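# Note on the fallback path above (illustrative): because A_scale and B_scale
# are the inverse scales returned by per_tensor_quantize, multiplying the
# upcast tensors by them (A.to(out_dtype) * A_scale) dequantizes back to the
# original magnitudes before the ordinary torch.nn.functional.linear call.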
def replace_module(model: AutoModelForCausalLM, name: str, new_module: torch.nn.Module):
if "." in name:
parent_name = name.rsplit(".", 1)[0]
child_name = name[len(parent_name) + 1 :]
parent = model.get_submodule(parent_name)
else:
parent_name = ""
parent = model
child_name = name
setattr(parent, child_name, new_module)
def quantize_weights(
model: AutoModelForCausalLM,
quantize_config: BaseQuantizeConfig,
):
named_modules = list(model.named_modules())
for name, linear in tqdm.tqdm(named_modules, desc="Quantizing weights"):
if (
not isinstance(linear, torch.nn.Linear)
or name in quantize_config.ignored_layers
):
continue
quant_weight, weight_scale = per_tensor_quantize(linear.weight)
bias = copy.deepcopy(linear.bias) if linear.bias is not None else None
quant_linear = FP8DynamicLinear(
weight=quant_weight, weight_scale=weight_scale, bias=bias
)
replace_module(model, name, quant_linear)
del linear.weight
del linear.bias
del linear
cleanup_memory()
def quantize_activations(
model: AutoModelForCausalLM,
quantize_config: BaseQuantizeConfig,
calibration_tokens,
):
# Replace weight quantizer with a dynamic activation quantizer observer
for name, dynamic_quant_linear in model.named_modules():
if (
not isinstance(dynamic_quant_linear, FP8DynamicLinear)
or name in quantize_config.ignored_layers
):
continue
quantizer = FP8StaticLinearQuantizer(
weight=dynamic_quant_linear.weight,
weight_scale=dynamic_quant_linear.weight_scale,
bias=dynamic_quant_linear.bias,
quantize_output=(
hasattr(quantize_config, "kv_cache_quant_layers")
and name in quantize_config.kv_cache_quant_layers
),
)
replace_module(model, name, quantizer)
del dynamic_quant_linear
cleanup_memory()
# Pass through calibration data to measure activation scales
with torch.inference_mode():
with tqdm.tqdm(
total=calibration_tokens.shape[0], desc="Calibrating activation scales"
) as pbar:
for row_idx in range(calibration_tokens.shape[0]):
model(calibration_tokens[row_idx].reshape(1, -1))
cleanup_memory()
pbar.update(1)
# Replace dynamic quantizer observer with StaticLinear for export
for name, quantizer in model.named_modules():
if (
not isinstance(quantizer, FP8StaticLinearQuantizer)
or name in quantize_config.ignored_layers
):
continue
static_proj = FP8StaticLinear(
weight=quantizer.weight,
weight_scale=quantizer.weight_scale,
bias=quantizer.bias,
input_scale=quantizer.input_scale,
output_scale=quantizer.output_scale,
)
replace_module(model, name, static_proj)
del quantizer
cleanup_memory()
# Post-process step for kv cache scales to take the k/v module
# `output_scale` parameters, and store them in the parent attention
# module as `k_scale` and `v_scale`
if hasattr(quantize_config, "kv_cache_quant_layers"):
# Assumes that list is ordered such that [layer0.k_proj, layer0.v_proj, layer1.k_proj, layer1.v_proj, ...]
# so we make a list of tuples [(layer0.k_proj, layer0.v_proj), (layer1.k_proj, layer1.v_proj), ...]
kv_proj_pairs = zip(*[iter(quantize_config.kv_cache_quant_layers)] * 2)
for k_proj_name, v_proj_name in kv_proj_pairs:
parent_module_name = ".".join(k_proj_name.split(".")[:-1])
assert parent_module_name == ".".join(v_proj_name.split(".")[:-1])
parent_module = dict(model.named_modules())[parent_module_name]
k_proj = dict(model.named_modules())[k_proj_name]
v_proj = dict(model.named_modules())[v_proj_name]
parent_module.k_scale = torch.nn.Parameter(
k_proj.output_scale, requires_grad=False
)
parent_module.v_scale = torch.nn.Parameter(
v_proj.output_scale, requires_grad=False
)
# Remove output_scale from k_proj and v_proj
k_proj.output_scale = None
v_proj.output_scale = None
cleanup_memory()
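# Illustrative note on the pairing idiom above: zip(*[iter(seq)] * 2) advances
# one shared iterator twice per step, so
#     list(zip(*[iter(["l0.k_proj", "l0.v_proj", "l1.k_proj", "l1.v_proj"])] * 2))
# yields [("l0.k_proj", "l0.v_proj"), ("l1.k_proj", "l1.v_proj")], which is why
# kv_cache_quant_layers must keep the [k_proj, v_proj, k_proj, v_proj, ...] order.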
def save_quantized_model(
model: AutoModelForCausalLM,
quant_config: BaseQuantizeConfig,
save_dir: str,
):
print(model)
print(f"Saving the model to {save_dir}")
static_q_dict = {
"quantization_config": {
"quant_method": "fp8",
"activation_scheme": quant_config.activation_scheme,
"ignored_layers": quant_config.ignored_layers,
}
}
if hasattr(quant_config, "kv_cache_quant_layers"):
static_q_dict["quantization_config"]["kv_cache_scheme"] = "static"
model.config.update(static_q_dict)
model.save_pretrained(save_dir)
tokenizer = AutoTokenizer.from_pretrained(model.config._name_or_path)
tokenizer.save_pretrained(save_dir)
def get_layers_to_ignore(model, ignore_patterns) -> List[str]:
ignored_layers = set()
for name, linear in model.named_modules():
if not isinstance(linear, torch.nn.Linear):
continue
for ignore_pattern in ignore_patterns:
regex_prefix = "re:"
if ignore_pattern.startswith(regex_prefix):
# check if name matches regex and add to set if true
regex_pattern = ignore_pattern[len(regex_prefix) :]
if re.search(regex_pattern, name):
ignored_layers.add(name)
else:
# else, exact match
if ignore_pattern == name:
ignored_layers.add(name)
return list(ignored_layers)
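# Illustrative example: with ignore_patterns=["re:.*lm_head"], a Linear module
# named "model.lm_head" is skipped because re.search(".*lm_head", "model.lm_head")
# matches, while a pattern without the "re:" prefix must equal the module name
# exactly.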
def get_kv_cache_quant_layers(model, kv_cache_quant_targets: Tuple[str]) -> List[str]:
kv_cache_quant_layers = []
for name, linear in model.named_modules():
if not isinstance(linear, torch.nn.Linear):
continue
for output_quant_target in kv_cache_quant_targets:
if name.endswith(output_quant_target):
kv_cache_quant_layers.append(name)
return kv_cache_quant_layers
def quantize_to_fp8_dynamic(input_model_dir: str, output_model_dir: str) -> None:
# Define quantization config with dynamic activation scales (no calibration data needed)
quantize_config = BaseQuantizeConfig(
quant_method="fp8", activation_scheme="dynamic"
)
# Load the model, quantize, and save checkpoint
model = AutoFP8ForCausalLM.from_pretrained(input_model_dir, quantize_config)
# No examples for dynamic quantization
model.quantize([])
model.save_quantized(output_model_dir)
if __name__ == "__main__":
# sys.argv[0] is the script name itself; the model paths are argv[1] and argv[2]
quantize_to_fp8_dynamic(sys.argv[1], sys.argv[2])
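A typical command-line invocation of this entry point would look roughly like the following (the script name and both paths are placeholders):

    python quantize_to_fp8_dynamic.py /path/to/input_model /path/to/output_dir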

View File

@ -1,109 +1,16 @@
from typing import Tuple
import psutil
from PySide6.QtCore import QTimer
from PySide6.QtGui import Qt
from PySide6.QtWidgets import QFileDialog, QLabel
from Localizations import * from Localizations import *
from error_handling import show_error import psutil
def resize_window(self, larger) -> None: def update_model_info(logger, self, model_info):
factor = 1.1 if larger else 1 / 1.1
current_width = self.width()
current_height = self.height()
new_width = int(current_width * factor)
new_height = int(current_height * factor)
self.resize(new_width, new_height)
def reset_size(self) -> None:
self.resize(self.default_width, self.default_height)
def parse_resolution(self) -> Tuple[int, int]:
res = os.environ.get("AUTOGGUF_RESOLUTION", "1650x1100")
try:
width, height = map(int, res.split("x"))
if width <= 0 or height <= 0:
raise ValueError
return width, height
except (ValueError, AttributeError):
return 1650, 1100
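# Illustrative example: AUTOGGUF_RESOLUTION="1920x1080" parses to (1920, 1080),
# while a malformed value such as "wide" or "0x600" falls back to the
# (1650, 1100) default.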
def browse_base_model(self) -> None:
self.logger.info(BROWSING_FOR_BASE_MODEL_FOLDER) # Updated log message
base_model_folder = QFileDialog.getExistingDirectory(self, SELECT_BASE_MODEL_FOLDER)
if base_model_folder:
self.base_model_path.setText(os.path.abspath(base_model_folder))
def browse_hf_model_input(self) -> None:
self.logger.info(BROWSE_FOR_HF_MODEL_DIRECTORY)
model_dir = QFileDialog.getExistingDirectory(self, SELECT_HF_MODEL_DIRECTORY)
if model_dir:
self.hf_model_input.setText(os.path.abspath(model_dir))
def browse_hf_outfile(self) -> None:
self.logger.info(BROWSE_FOR_HF_TO_GGUF_OUTPUT)
outfile, _ = QFileDialog.getSaveFileName(self, SELECT_OUTPUT_FILE, "", GGUF_FILES)
if outfile:
self.hf_outfile.setText(os.path.abspath(outfile))
def browse_imatrix_datafile(self) -> None:
self.logger.info(BROWSING_FOR_IMATRIX_DATA_FILE)
datafile, _ = QFileDialog.getOpenFileName(self, SELECT_DATA_FILE, "", ALL_FILES)
if datafile:
self.imatrix_datafile.setText(os.path.abspath(datafile))
def browse_imatrix_model(self) -> None:
self.logger.info(BROWSING_FOR_IMATRIX_MODEL_FILE)
model_file, _ = QFileDialog.getOpenFileName(self, SELECT_MODEL_FILE, "", GGUF_FILES)
if model_file:
self.imatrix_model.setText(os.path.abspath(model_file))
def browse_imatrix_output(self) -> None:
self.logger.info(BROWSING_FOR_IMATRIX_OUTPUT_FILE)
output_file, _ = QFileDialog.getSaveFileName(
self, SELECT_OUTPUT_FILE, "", DAT_FILES
)
if output_file:
self.imatrix_output.setText(os.path.abspath(output_file))
def create_label(self, text, tooltip) -> QLabel:
label = QLabel(text)
label.setToolTip(tooltip)
return label
def toggle_gpu_offload_auto(self, state) -> None:
is_auto = state == Qt.CheckState.Checked
self.gpu_offload_slider.setEnabled(not is_auto)
self.gpu_offload_spinbox.setEnabled(not is_auto)
def update_model_info(logger, model_info) -> None:
logger.debug(UPDATING_MODEL_INFO.format(model_info)) logger.debug(UPDATING_MODEL_INFO.format(model_info))
pass pass
def update_system_info(self) -> None: def update_system_info(self):
ram = psutil.virtual_memory() ram = psutil.virtual_memory()
cpu = psutil.cpu_percent() cpu = psutil.cpu_percent()
self.ram_bar.setValue(int(ram.percent))
# Smooth transition for RAM bar
animate_bar(self, self.ram_bar, ram.percent)
# Smooth transition for CPU bar
animate_bar(self, self.cpu_bar, cpu)
self.ram_bar.setFormat( self.ram_bar.setFormat(
RAM_USAGE_FORMAT.format( RAM_USAGE_FORMAT.format(
ram.percent, ram.used // 1024 // 1024, ram.total // 1024 // 1024 ram.percent, ram.used // 1024 // 1024, ram.total // 1024 // 1024
@ -111,47 +18,12 @@ def update_system_info(self) -> None:
) )
self.cpu_label.setText(CPU_USAGE_FORMAT.format(cpu)) self.cpu_label.setText(CPU_USAGE_FORMAT.format(cpu))
# Collect CPU and RAM usage data
self.cpu_data.append(cpu)
self.ram_data.append(ram.percent)
if len(self.cpu_data) > 60: def update_download_progress(self, progress):
self.cpu_data.pop(0)
self.ram_data.pop(0)
def animate_bar(self, bar, target_value) -> None:
current_value = bar.value()
difference = target_value - current_value
if abs(difference) <= 1: # Avoid animation for small changes
bar.setValue(target_value)
return
step = 1 if difference > 0 else -1 # Increment or decrement based on difference
timer = QTimer(self)
timer.timeout.connect(lambda: _animate_step(bar, target_value, step, timer))
timer.start(10) # Adjust the interval for animation speed
def _animate_step(bar, target_value, step, timer) -> None:
current_value = bar.value()
new_value = current_value + step
if (step > 0 and new_value > target_value) or (
step < 0 and new_value < target_value
):
bar.setValue(target_value)
timer.stop()
else:
bar.setValue(new_value)
def update_download_progress(self, progress) -> None:
self.download_progress.setValue(progress) self.download_progress.setValue(progress)
def update_cuda_backends(self) -> None: def update_cuda_backends(self):
self.logger.debug(UPDATING_CUDA_BACKENDS) self.logger.debug(UPDATING_CUDA_BACKENDS)
self.backend_combo_cuda.clear() self.backend_combo_cuda.clear()
llama_bin = os.path.abspath("llama_bin") llama_bin = os.path.abspath("llama_bin")
@ -159,9 +31,7 @@ def update_cuda_backends(self) -> None:
for item in os.listdir(llama_bin): for item in os.listdir(llama_bin):
item_path = os.path.join(llama_bin, item) item_path = os.path.join(llama_bin, item)
if os.path.isdir(item_path) and "cudart-llama" not in item.lower(): if os.path.isdir(item_path) and "cudart-llama" not in item.lower():
if ( if "cu1" in item.lower(): # Only include CUDA-capable backends
"cu1" in item.lower() or "cuda-1" in item.lower()
): # Only include CUDA-capable backends
self.backend_combo_cuda.addItem(item, userData=item_path) self.backend_combo_cuda.addItem(item, userData=item_path)
if self.backend_combo_cuda.count() == 0: if self.backend_combo_cuda.count() == 0:
@ -171,23 +41,23 @@ def update_cuda_backends(self) -> None:
self.backend_combo_cuda.setEnabled(True) self.backend_combo_cuda.setEnabled(True)
def update_threads_spinbox(self, value) -> None: def update_threads_spinbox(self, value):
self.threads_spinbox.setValue(value) self.threads_spinbox.setValue(value)
def update_threads_slider(self, value) -> None: def update_threads_slider(self, value):
self.threads_slider.setValue(value) self.threads_slider.setValue(value)
def update_gpu_offload_spinbox(self, value) -> None: def update_gpu_offload_spinbox(self, value):
self.gpu_offload_spinbox.setValue(value) self.gpu_offload_spinbox.setValue(value)
def update_gpu_offload_slider(self, value) -> None: def update_gpu_offload_slider(self, value):
self.gpu_offload_slider.setValue(value) self.gpu_offload_slider.setValue(value)
def update_cuda_option(self) -> None: def update_cuda_option(self):
self.logger.debug(UPDATING_CUDA_OPTIONS) self.logger.debug(UPDATING_CUDA_OPTIONS)
asset = self.asset_combo.currentData() asset = self.asset_combo.currentData()
@ -207,7 +77,7 @@ def update_cuda_option(self) -> None:
self.update_cuda_backends() self.update_cuda_backends()
def update_assets(self) -> None: def update_assets(self):
self.logger.debug(UPDATING_ASSET_LIST) self.logger.debug(UPDATING_ASSET_LIST)
self.asset_combo.clear() self.asset_combo.clear()
release = self.release_combo.currentData() release = self.release_combo.currentData()
@ -222,6 +92,6 @@ def update_assets(self) -> None:
self.update_cuda_option() self.update_cuda_option()
def update_base_model_visibility(self, index) -> None: def update_base_model_visibility(self, index):
is_gguf = self.lora_output_type_combo.itemText(index) == "GGUF" is_gguf = self.lora_output_type_combo.itemText(index) == "GGUF"
self.base_model_wrapper.setVisible(is_gguf) self.base_model_wrapper.setVisible(is_gguf)

View File

@ -1,138 +1,14 @@
from typing import Any, Union from PySide6.QtWidgets import QFileDialog
import urllib.request from error_handling import show_error
import urllib.error from Localizations import *
import json import requests
import ssl
import certifi
from PySide6.QtCore import Qt
from PySide6.QtWidgets import QFileDialog, QInputDialog, QMenu
from DownloadThread import DownloadThread from DownloadThread import DownloadThread
from Localizations import * from imports_and_globals import ensure_directory
from error_handling import show_error
from globals import ensure_directory
from KVOverrideEntry import KVOverrideEntry
def show_model_context_menu(self, position): def browse_lora_input(self):
item = self.model_tree.itemAt(position)
if item:
# Child of a sharded model or top-level item without children
if item.parent() is not None or item.childCount() == 0:
menu = QMenu()
rename_action = menu.addAction(RENAME)
delete_action = menu.addAction(DELETE)
action = menu.exec(self.model_tree.viewport().mapToGlobal(position))
if action == rename_action:
self.rename_model(item)
elif action == delete_action:
self.delete_model(item)
def rename_model(self, item):
old_name = item.text(0)
new_name, ok = QInputDialog.getText(self, RENAME, f"New name for {old_name}:")
if ok and new_name:
old_path = os.path.join(self.models_input.text(), old_name)
new_path = os.path.join(self.models_input.text(), new_name)
try:
os.rename(old_path, new_path)
item.setText(0, new_name)
self.logger.info(MODEL_RENAMED_SUCCESSFULLY.format(old_name, new_name))
except Exception as e:
show_error(self.logger, f"Error renaming model: {e}")
def add_kv_override(self, override_string=None) -> None:
entry = KVOverrideEntry()
entry.deleted.connect(self.remove_kv_override)
if override_string:
key, value = override_string.split("=")
type_, val = value.split(":")
entry.key_input.setText(key)
entry.type_combo.setCurrentText(type_)
entry.value_input.setText(val)
self.kv_override_layout.addWidget(entry)
self.kv_override_entries.append(entry)
def remove_kv_override(self, entry) -> None:
self.kv_override_layout.removeWidget(entry)
self.kv_override_entries.remove(entry)
entry.deleteLater()
def get_models_data(self) -> list[dict[str, Union[str, Any]]]:
models = []
root = self.model_tree.invisibleRootItem()
child_count = root.childCount()
for i in range(child_count):
item = root.child(i)
model_name = item.text(0)
model_type = "sharded" if "sharded" in model_name.lower() else "single"
model_path = item.data(0, Qt.ItemDataRole.UserRole)
models.append({"name": model_name, "type": model_type, "path": model_path})
return models
def get_tasks_data(self) -> list[dict[str, Union[int, Any]]]:
tasks = []
for i in range(self.task_list.count()):
item = self.task_list.item(i)
task_widget = self.task_list.itemWidget(item)
if task_widget:
tasks.append(
{
"name": task_widget.task_name,
"status": task_widget.status,
"progress": (
task_widget.progress_bar.value()
if hasattr(task_widget, "progress_bar")
else 0
),
"log_file": task_widget.log_file,
}
)
return tasks
def browse_models(self) -> None:
self.logger.info(BROWSING_FOR_MODELS_DIRECTORY)
models_path = QFileDialog.getExistingDirectory(self, SELECT_MODELS_DIRECTORY)
if models_path:
self.models_input.setText(os.path.abspath(models_path))
ensure_directory(models_path)
self.load_models()
def browse_output(self) -> None:
self.logger.info(BROWSING_FOR_OUTPUT_DIRECTORY)
output_path = QFileDialog.getExistingDirectory(self, SELECT_OUTPUT_DIRECTORY)
if output_path:
self.output_input.setText(os.path.abspath(output_path))
ensure_directory(output_path)
def browse_logs(self) -> None:
self.logger.info(BROWSING_FOR_LOGS_DIRECTORY)
logs_path = QFileDialog.getExistingDirectory(self, SELECT_LOGS_DIRECTORY)
if logs_path:
self.logs_input.setText(os.path.abspath(logs_path))
ensure_directory(logs_path)
def browse_imatrix(self) -> None:
self.logger.info(BROWSING_FOR_IMATRIX_FILE)
imatrix_file, _ = QFileDialog.getOpenFileName(
self, SELECT_IMATRIX_FILE, "", DAT_FILES
)
if imatrix_file:
self.imatrix.setText(os.path.abspath(imatrix_file))
def browse_lora_input(self) -> None:
self.logger.info(BROWSING_FOR_LORA_INPUT_DIRECTORY) self.logger.info(BROWSING_FOR_LORA_INPUT_DIRECTORY)
lora_input_path = QFileDialog.getExistingDirectory( lora_input_path = QFileDialog.getExistingDirectory(
self, SELECT_LORA_INPUT_DIRECTORY self, SELECT_LORA_INPUT_DIRECTORY
@ -142,7 +18,7 @@ def browse_lora_input(self) -> None:
ensure_directory(lora_input_path) ensure_directory(lora_input_path)
def browse_lora_output(self) -> None: def browse_lora_output(self):
self.logger.info(BROWSING_FOR_LORA_OUTPUT_FILE) self.logger.info(BROWSING_FOR_LORA_OUTPUT_FILE)
lora_output_file, _ = QFileDialog.getSaveFileName( lora_output_file, _ = QFileDialog.getSaveFileName(
self, SELECT_LORA_OUTPUT_FILE, "", GGUF_AND_BIN_FILES self, SELECT_LORA_OUTPUT_FILE, "", GGUF_AND_BIN_FILES
@ -151,7 +27,7 @@ def browse_lora_output(self) -> None:
self.lora_output.setText(os.path.abspath(lora_output_file)) self.lora_output.setText(os.path.abspath(lora_output_file))
def download_llama_cpp(self) -> None: def download_llama_cpp(self):
self.logger.info(STARTING_LLAMACPP_DOWNLOAD) self.logger.info(STARTING_LLAMACPP_DOWNLOAD)
asset = self.asset_combo.currentData() asset = self.asset_combo.currentData()
if not asset: if not asset:
@ -173,47 +49,18 @@ def download_llama_cpp(self) -> None:
self.download_progress.setValue(0) self.download_progress.setValue(0)
def get_repo_from_env() -> tuple[str, str]: def refresh_releases(self):
repo = os.getenv("AUTOGGUF_BACKEND_REPO", "ggerganov/llama.cpp")
if not repo or "/" not in repo:
raise ValueError(INVALID_REPOSITORY_FORMAT)
owner, repo_name = repo.split("/", 1)
if not all(part.strip() for part in (owner, repo_name)):
raise ValueError(REPO_CANNOT_BE_EMPTY)
return owner, repo_name
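# Illustrative example: with AUTOGGUF_BACKEND_REPO="ggerganov/llama.cpp" this
# returns ("ggerganov", "llama.cpp"); a value without a slash (e.g. "llamacpp")
# raises ValueError(INVALID_REPOSITORY_FORMAT), and an unset variable falls
# back to the ggerganov/llama.cpp default.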
def refresh_releases(self) -> None:
self.logger.info(REFRESHING_LLAMACPP_RELEASES) self.logger.info(REFRESHING_LLAMACPP_RELEASES)
try: try:
owner, repo = get_repo_from_env() response = requests.get(
url = f"https://api.github.com/repos/{owner}/{repo}/releases" "https://api.github.com/repos/ggerganov/llama.cpp/releases"
# Create SSL context with certifi certificates
ssl_context = ssl.create_default_context(cafile=certifi.where())
# Create request
req = urllib.request.Request(url)
# Make the request
with urllib.request.urlopen(req, context=ssl_context) as response:
if response.status != 200:
raise urllib.error.HTTPError(
url, response.status, "HTTP Error", response.headers, None
) )
response.raise_for_status() # Raise an exception for bad status codes
releases = json.loads(response.read().decode("utf-8")) releases = response.json()
self.release_combo.clear() self.release_combo.clear()
for release in releases: for release in releases:
self.release_combo.addItem(release["tag_name"], userData=release) self.release_combo.addItem(release["tag_name"], userData=release)
self.release_combo.currentIndexChanged.connect(self.update_assets) self.release_combo.currentIndexChanged.connect(self.update_assets)
self.update_assets() self.update_assets()
except requests.exceptions.RequestException as e:
except ValueError as e:
show_error(self.logger, f"Invalid repository configuration: {str(e)}")
except (urllib.error.URLError, urllib.error.HTTPError) as e:
show_error(self.logger, ERROR_FETCHING_RELEASES.format(str(e))) show_error(self.logger, ERROR_FETCHING_RELEASES.format(str(e)))