DeepSeek-V3/inference/fp8_cast_bf16.py

import os
import json
from argparse import ArgumentParser
from glob import glob
from tqdm import tqdm

import torch
from safetensors.torch import load_file, save_file

from kernel import weight_dequant

def main(fp8_path, bf16_path):
    """
    Converts FP8 weights to BF16 and saves the converted weights.

    Every ``*.safetensors`` file found directly inside ``fp8_path`` is loaded onto the
    GPU. Each tensor with a one-byte element size is treated as an FP8 weight and passed
    to ``weight_dequant`` together with its ``<name>_scale_inv`` tensor; all other
    tensors are copied through unchanged. The result is saved under the same file name
    in ``bf16_path``, and a new ``model.safetensors.index.json`` is written there.

    Args:
        fp8_path (str): The path to the directory containing the FP8 weights and model index file.
        bf16_path (str): The path to the directory where the converted BF16 weights will be saved.
            It is created if it does not exist.

    Raises:
        FileNotFoundError: If ``model.safetensors.index.json`` is missing from ``fp8_path``.
        KeyError: If the index file has no ``"weight_map"`` entry.

    Notes:
        - The function assumes that the FP8 weights are stored in safetensor files and that
          a CUDA device is available (files are loaded with ``device="cuda"``).
        - The process-wide default torch dtype is set to bfloat16 as a side effect.
        - A missing scale_inv tensor is NOT an error: a warning is printed and the weight
          is written to the output unchanged.
        - At most two loaded safetensor files are kept cached between iterations; the
          oldest-loaded entries are evicted first.
        - The written index drops the scale_inv entries of every converted weight and
          replaces the original ``metadata`` with an empty object.
    """
    torch.set_default_dtype(torch.bfloat16)
    os.makedirs(bf16_path, exist_ok=True)
    model_index_file = os.path.join(fp8_path, "model.safetensors.index.json")
    with open(model_index_file, "r") as f:
        model_index = json.load(f)
    weight_map = model_index["weight_map"]

    # Cache of loaded safetensor files, keyed by file name (insertion-ordered).
    loaded_files = {}
    # Names of the weights that were actually dequantized.
    fp8_weight_names = []

    # Helper function to get tensor from the correct file
    def get_tensor(tensor_name):
        """
        Retrieves a tensor from the cached safetensor files or loads it from disk if not cached.

        Args:
            tensor_name (str): The name of the tensor to retrieve.

        Returns:
            torch.Tensor: The retrieved tensor.

        Raises:
            KeyError: If the tensor is not listed in the index, or does not exist in the
                safetensor file the index points to.
        """
        file_name = weight_map[tensor_name]
        if file_name not in loaded_files:
            file_path = os.path.join(fp8_path, file_name)
            loaded_files[file_name] = load_file(file_path, device="cuda")
        return loaded_files[file_name][tensor_name]

    # Sorted order keeps consecutive shards adjacent, so a scale stored in a neighbouring
    # shard is likely to be needed again soon.
    safetensor_files = sorted(glob(os.path.join(fp8_path, "*.safetensors")))
    for safetensor_file in tqdm(safetensor_files):
        file_name = os.path.basename(safetensor_file)

        # Reuse the shard if get_tensor already pulled it in for an earlier file,
        # instead of reading it from disk a second time.
        current_state_dict = loaded_files.get(file_name)
        if current_state_dict is None:
            current_state_dict = load_file(safetensor_file, device="cuda")
            loaded_files[file_name] = current_state_dict

        new_state_dict = {}
        for weight_name, weight in current_state_dict.items():
            if weight_name.endswith("_scale_inv"):
                # Scale tensors are consumed during dequantization, never written out.
                continue
            if weight.element_size() != 1:
                # Not a one-byte dtype, so not FP8: copy through untouched.
                new_state_dict[weight_name] = weight
                continue

            # FP8 weight: its scale may live in a different safetensor file.
            scale_inv_name = f"{weight_name}_scale_inv"
            try:
                # Only the lookup is guarded, so an error raised inside weight_dequant
                # is never mistaken for a missing scale.
                scale_inv = get_tensor(scale_inv_name)
            except KeyError:
                print(f"Warning: Missing scale_inv tensor for {weight_name}, skipping conversion")
                new_state_dict[weight_name] = weight
            else:
                fp8_weight_names.append(weight_name)
                new_state_dict[weight_name] = weight_dequant(weight, scale_inv)

        new_safetensor_file = os.path.join(bf16_path, file_name)
        save_file(new_state_dict, new_safetensor_file)

        # Memory management: keep at most 2 cached files. get_tensor may have loaded
        # several extra shards during this iteration, so evict in a loop (oldest-loaded
        # first) rather than dropping a single entry.
        evicted = False
        while len(loaded_files) > 2:
            del loaded_files[next(iter(loaded_files))]
            evicted = True
        if evicted:
            torch.cuda.empty_cache()

    # Update model index: the scales of converted weights no longer exist in the output.
    new_model_index_file = os.path.join(bf16_path, "model.safetensors.index.json")
    for weight_name in fp8_weight_names:
        weight_map.pop(f"{weight_name}_scale_inv", None)
    with open(new_model_index_file, "w") as f:
        json.dump({"metadata": {}, "weight_map": weight_map}, f, indent=2)
        

if __name__ == "__main__":
    # Command-line entry point: both directory flags are mandatory strings.
    cli = ArgumentParser()
    for flag in ("--input-fp8-hf-path", "--output-bf16-hf-path"):
        cli.add_argument(flag, type=str, required=True)
    options = cli.parse_args()

    # argparse exposes the dashed flag names as underscore attributes.
    source_dir = options.input_fp8_hf_path
    target_dir = options.output_bf16_hf_path
    main(source_dir, target_dir)
Release DeepSeek-V3 2024-12-26 11:01:57 +00:00			`import os`
			`import json`
			`from argparse import ArgumentParser`
			`from glob import glob`
			`from tqdm import tqdm`

			`import torch`
			`from safetensors.torch import load_file, save_file`

			`from kernel import weight_dequant`

			`def main(fp8_path, bf16_path):`
Enhance documentation and update .gitignore for model conversion scripts 2025-01-05 18:18:18 +00:00			`"""`
			`Converts FP8 weights to BF16 and saves the converted weights.`

			`This function reads FP8 weights from the specified directory, converts them to BF16,`
			`and saves the converted weights to another specified directory. It also updates the`
			`model index file to reflect the changes.`

			`Args:`
			`fp8_path (str): The path to the directory containing the FP8 weights and model index file.`
			`bf16_path (str): The path to the directory where the converted BF16 weights will be saved.`

			`Raises:`
			`KeyError: If a required scale_inv tensor is missing for a weight.`

			`Notes:`
			`- The function assumes that the FP8 weights are stored in safetensor files.`
			`- The function caches loaded safetensor files to optimize memory usage.`
			`- The function updates the model index file to remove references to scale_inv tensors.`
			`"""`
Release DeepSeek-V3 2024-12-26 11:01:57 +00:00			`torch.set_default_dtype(torch.bfloat16)`
			`os.makedirs(bf16_path, exist_ok=True)`
			`model_index_file = os.path.join(fp8_path, "model.safetensors.index.json")`
			`with open(model_index_file, "r") as f:`
			`model_index = json.load(f)`
			`weight_map = model_index["weight_map"]`

handle missing scale_inv_name (#2) * handle missing scale_inv_name Fixed an issue where `weight` and `weight_scale_inv` (e.g. `model.layers.39.mlp.experts.92.gate_proj.weight` and `model.layers.39.mlp.experts.92.gate_proj.weight_scale_inv`) were not in the same SafeTensor, causing an assertion error due to scale_inv_name not being in the state_dict. * sort filename to reduce memory costs * Add CUDA cache clearing in memory management Added torch.cuda.empty_cache() to free up unused memory on the GPU, 2024-12-27 01:34:38 +00:00			`# Cache for loaded safetensor files`
			`loaded_files = {}`
			`fp8_weight_names = []`

			`# Helper function to get tensor from the correct file`
			`def get_tensor(tensor_name):`
Enhance documentation and update .gitignore for model conversion scripts 2025-01-05 18:18:18 +00:00			`"""`
			`Retrieves a tensor from the cached safetensor files or loads it from disk if not cached.`

			`Args:`
			`tensor_name (str): The name of the tensor to retrieve.`

			`Returns:`
			`torch.Tensor: The retrieved tensor.`

			`Raises:`
			`KeyError: If the tensor does not exist in the safetensor file.`
			`"""`
handle missing scale_inv_name (#2) * handle missing scale_inv_name Fixed an issue where `weight` and `weight_scale_inv` (e.g. `model.layers.39.mlp.experts.92.gate_proj.weight` and `model.layers.39.mlp.experts.92.gate_proj.weight_scale_inv`) were not in the same SafeTensor, causing an assertion error due to scale_inv_name not being in the state_dict. * sort filename to reduce memory costs * Add CUDA cache clearing in memory management Added torch.cuda.empty_cache() to free up unused memory on the GPU, 2024-12-27 01:34:38 +00:00			`file_name = weight_map[tensor_name]`
			`if file_name not in loaded_files:`
			`file_path = os.path.join(fp8_path, file_name)`
			`loaded_files[file_name] = load_file(file_path, device="cuda")`
			`return loaded_files[file_name][tensor_name]`

Release DeepSeek-V3 2024-12-26 11:01:57 +00:00			`safetensor_files = list(glob(os.path.join(fp8_path, "*.safetensors")))`
handle missing scale_inv_name (#2) * handle missing scale_inv_name Fixed an issue where `weight` and `weight_scale_inv` (e.g. `model.layers.39.mlp.experts.92.gate_proj.weight` and `model.layers.39.mlp.experts.92.gate_proj.weight_scale_inv`) were not in the same SafeTensor, causing an assertion error due to scale_inv_name not being in the state_dict. * sort filename to reduce memory costs * Add CUDA cache clearing in memory management Added torch.cuda.empty_cache() to free up unused memory on the GPU, 2024-12-27 01:34:38 +00:00			`safetensor_files.sort()`
Release DeepSeek-V3 2024-12-26 11:01:57 +00:00			`for safetensor_file in tqdm(safetensor_files):`
			`file_name = os.path.basename(safetensor_file)`
handle missing scale_inv_name (#2) * handle missing scale_inv_name Fixed an issue where `weight` and `weight_scale_inv` (e.g. `model.layers.39.mlp.experts.92.gate_proj.weight` and `model.layers.39.mlp.experts.92.gate_proj.weight_scale_inv`) were not in the same SafeTensor, causing an assertion error due to scale_inv_name not being in the state_dict. * sort filename to reduce memory costs * Add CUDA cache clearing in memory management Added torch.cuda.empty_cache() to free up unused memory on the GPU, 2024-12-27 01:34:38 +00:00			`current_state_dict = load_file(safetensor_file, device="cuda")`
			`loaded_files[file_name] = current_state_dict`

Release DeepSeek-V3 2024-12-26 11:01:57 +00:00			`new_state_dict = {}`
handle missing scale_inv_name (#2) * handle missing scale_inv_name Fixed an issue where `weight` and `weight_scale_inv` (e.g. `model.layers.39.mlp.experts.92.gate_proj.weight` and `model.layers.39.mlp.experts.92.gate_proj.weight_scale_inv`) were not in the same SafeTensor, causing an assertion error due to scale_inv_name not being in the state_dict. * sort filename to reduce memory costs * Add CUDA cache clearing in memory management Added torch.cuda.empty_cache() to free up unused memory on the GPU, 2024-12-27 01:34:38 +00:00			`for weight_name, weight in current_state_dict.items():`
Release DeepSeek-V3 2024-12-26 11:01:57 +00:00			`if weight_name.endswith("_scale_inv"):`
			`continue`
handle missing scale_inv_name (#2) * handle missing scale_inv_name Fixed an issue where `weight` and `weight_scale_inv` (e.g. `model.layers.39.mlp.experts.92.gate_proj.weight` and `model.layers.39.mlp.experts.92.gate_proj.weight_scale_inv`) were not in the same SafeTensor, causing an assertion error due to scale_inv_name not being in the state_dict. * sort filename to reduce memory costs * Add CUDA cache clearing in memory management Added torch.cuda.empty_cache() to free up unused memory on the GPU, 2024-12-27 01:34:38 +00:00			`elif weight.element_size() == 1: # FP8 weight`
Release DeepSeek-V3 2024-12-26 11:01:57 +00:00			`scale_inv_name = f"{weight_name}_scale_inv"`
handle missing scale_inv_name (#2) * handle missing scale_inv_name Fixed an issue where `weight` and `weight_scale_inv` (e.g. `model.layers.39.mlp.experts.92.gate_proj.weight` and `model.layers.39.mlp.experts.92.gate_proj.weight_scale_inv`) were not in the same SafeTensor, causing an assertion error due to scale_inv_name not being in the state_dict. * sort filename to reduce memory costs * Add CUDA cache clearing in memory management Added torch.cuda.empty_cache() to free up unused memory on the GPU, 2024-12-27 01:34:38 +00:00			`try:`
			`# Get scale_inv from the correct file`
			`scale_inv = get_tensor(scale_inv_name)`
			`fp8_weight_names.append(weight_name)`
			`new_state_dict[weight_name] = weight_dequant(weight, scale_inv)`
			`except KeyError:`
			`print(f"Warning: Missing scale_inv tensor for {weight_name}, skipping conversion")`
			`new_state_dict[weight_name] = weight`
Release DeepSeek-V3 2024-12-26 11:01:57 +00:00			`else:`
			`new_state_dict[weight_name] = weight`
handle missing scale_inv_name (#2) * handle missing scale_inv_name Fixed an issue where `weight` and `weight_scale_inv` (e.g. `model.layers.39.mlp.experts.92.gate_proj.weight` and `model.layers.39.mlp.experts.92.gate_proj.weight_scale_inv`) were not in the same SafeTensor, causing an assertion error due to scale_inv_name not being in the state_dict. * sort filename to reduce memory costs * Add CUDA cache clearing in memory management Added torch.cuda.empty_cache() to free up unused memory on the GPU, 2024-12-27 01:34:38 +00:00
Release DeepSeek-V3 2024-12-26 11:01:57 +00:00			`new_safetensor_file = os.path.join(bf16_path, file_name)`
			`save_file(new_state_dict, new_safetensor_file)`
handle missing scale_inv_name (#2) * handle missing scale_inv_name Fixed an issue where `weight` and `weight_scale_inv` (e.g. `model.layers.39.mlp.experts.92.gate_proj.weight` and `model.layers.39.mlp.experts.92.gate_proj.weight_scale_inv`) were not in the same SafeTensor, causing an assertion error due to scale_inv_name not being in the state_dict. * sort filename to reduce memory costs * Add CUDA cache clearing in memory management Added torch.cuda.empty_cache() to free up unused memory on the GPU, 2024-12-27 01:34:38 +00:00
			`# Memory management: keep only the 2 most recently used files`
			`if len(loaded_files) > 2:`
			`oldest_file = next(iter(loaded_files))`
			`del loaded_files[oldest_file]`
			`torch.cuda.empty_cache()`
Release DeepSeek-V3 2024-12-26 11:01:57 +00:00
handle missing scale_inv_name (#2) * handle missing scale_inv_name Fixed an issue where `weight` and `weight_scale_inv` (e.g. `model.layers.39.mlp.experts.92.gate_proj.weight` and `model.layers.39.mlp.experts.92.gate_proj.weight_scale_inv`) were not in the same SafeTensor, causing an assertion error due to scale_inv_name not being in the state_dict. * sort filename to reduce memory costs * Add CUDA cache clearing in memory management Added torch.cuda.empty_cache() to free up unused memory on the GPU, 2024-12-27 01:34:38 +00:00			`# Update model index`
Release DeepSeek-V3 2024-12-26 11:01:57 +00:00			`new_model_index_file = os.path.join(bf16_path, "model.safetensors.index.json")`
			`for weight_name in fp8_weight_names:`
			`scale_inv_name = f"{weight_name}_scale_inv"`
handle missing scale_inv_name (#2) * handle missing scale_inv_name Fixed an issue where `weight` and `weight_scale_inv` (e.g. `model.layers.39.mlp.experts.92.gate_proj.weight` and `model.layers.39.mlp.experts.92.gate_proj.weight_scale_inv`) were not in the same SafeTensor, causing an assertion error due to scale_inv_name not being in the state_dict. * sort filename to reduce memory costs * Add CUDA cache clearing in memory management Added torch.cuda.empty_cache() to free up unused memory on the GPU, 2024-12-27 01:34:38 +00:00			`if scale_inv_name in weight_map:`
			`weight_map.pop(scale_inv_name)`
Release DeepSeek-V3 2024-12-26 11:01:57 +00:00			`with open(new_model_index_file, "w") as f:`
			`json.dump({"metadata": {}, "weight_map": weight_map}, f, indent=2)`


			`if __name__ == "__main__":`
			`parser = ArgumentParser()`
			`parser.add_argument("--input-fp8-hf-path", type=str, required=True)`
			`parser.add_argument("--output-bf16-hf-path", type=str, required=True)`
			`args = parser.parse_args()`
			`main(args.input_fp8_hf_path, args.output_bf16_hf_path)`
handle missing scale_inv_name (#2) * handle missing scale_inv_name Fixed an issue where `weight` and `weight_scale_inv` (e.g. `model.layers.39.mlp.experts.92.gate_proj.weight` and `model.layers.39.mlp.experts.92.gate_proj.weight_scale_inv`) were not in the same SafeTensor, causing an assertion error due to scale_inv_name not being in the state_dict. * sort filename to reduce memory costs * Add CUDA cache clearing in memory management Added torch.cuda.empty_cache() to free up unused memory on the GPU, 2024-12-27 01:34:38 +00:00