Base Model
from accelerate.utils import set_seed
from hqq.core.peft import PeftUtils
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModelForCausalLM
from transformers.utils.quantization_config import BitsAndBytesConfig
from transformers.pytorch_utils import Conv1D
import transformers
from transformers import LlamaConfig, LlamaForCausalLM
from transformers.integrations.bitsandbytes import replace_with_bnb_linear
from transformers.models.llama.modeling_llama import LlamaDecoderLayer
from peft.tuners.lora.config import LoraConfig
from peft.mapping import get_peft_model
from peft.utils.peft_types import *
import os
import gc
import inspect
from functools import partial
from pathlib import Path

save_dir = Path("profile_snapshots/")
os.makedirs(save_dir, exist_ok=True)

transformers.logging.set_verbosity_warning()
def malloc_in_gb():
    return torch.cuda.memory_allocated()/1e9
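Note that `torch.cuda.memory_allocated()` only counts live tensors; the CUDA caching allocator typically reserves more from the driver, which is roughly what nvidia-smi shows. If you want to compare against that number, a minimal companion helper (not part of the profiling below) would be:

def reserved_in_gb():
    # reserved = allocated tensors + the caching allocator's free pool
    return torch.cuda.memory_reserved()/1e9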
def get_model_size_config(model_size):
    if model_size == "DEBUG":
        model_size_config = dict(hidden_size=128,
                                 num_hidden_layers=2,
                                 num_attention_heads=2,
                                 num_key_value_heads=2,
                                 intermediate_size=256)
    elif model_size == "60M":
        model_size_config = dict(hidden_size=512,
                                 num_hidden_layers=4,
                                 num_attention_heads=4,
                                 num_key_value_heads=4,
                                 intermediate_size=1024)
    elif model_size == "120M":
        model_size_config = dict(hidden_size=768,
                                 num_hidden_layers=12,
                                 num_attention_heads=12,
                                 num_key_value_heads=12,
                                 intermediate_size=1536)
    elif model_size == "290M":
        model_size_config = dict(hidden_size=1024,
                                 num_hidden_layers=12,
                                 num_attention_heads=16,
                                 num_key_value_heads=16,
                                 intermediate_size=4096)
    elif model_size == "1B":
        model_size_config = dict(hidden_size=2048,
                                 num_hidden_layers=24,
                                 num_attention_heads=16,
                                 num_key_value_heads=16,
                                 intermediate_size=4096)
    elif model_size == "7B":
        model_size_config = {}
    return model_size_config

def create_model(model_size="1B"):
    model_size_config = get_model_size_config(model_size)
    # build a Llama config with the size-specific values; weights are randomly initialized, nothing is downloaded.
    config = LlamaConfig()
    config.update(model_size_config)
    model = LlamaForCausalLM(config)
    return model
def free_memory():
    gc.collect()
    torch.cuda.empty_cache()

print(f"Memory allocated: {malloc_in_gb():.3f} GB")
Memory allocated: 0.000 GB
# create dummy inputs
model = create_model("DEBUG")
vocab_size = model.model.embed_tokens.weight.size(0)
inputs = [torch.randint(0, vocab_size, (1, sl)) for sl in [512, 1024, 2048, 3072]]
print(f"Memory allocated: {malloc_in_gb():.3f} GB")
Memory allocated: 0.000 GB
def profile_model(create_model_func, inference=False, save_filename="mem_profile.pickle"):
    """
    https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html#demonstrating-speedups
    https://pytorch.org/docs/stable/torch_cuda_memory.html
    https://medium.com/pytorch/how-activation-checkpointing-enables-scaling-up-training-deep-learning-models-7a93ae01ff2d
    https://pytorch.org/tutorials/intermediate/autograd_saved_tensors_hooks_tutorial.html
    """
    set_seed(42)
    torch.cuda.memory._record_memory_history()
    for x in inputs:
        print(f"Input Size:{tuple(x.size())}")
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        if inference:
            with torch.no_grad():
                model = create_model_func()
                model.to("cuda", torch.bfloat16);
                print(f"Memory allocated [MODEL]: {malloc_in_gb():.3f} GB")
                output = model(x.to("cuda"))
                print(f"Memory allocated [FWD]: {malloc_in_gb():.3f} GB")
        else:
            model = create_model_func()
            model.to("cuda", torch.bfloat16);
            print(f"Memory allocated [MODEL]: {malloc_in_gb():.3f} GB")
            output = model(x.to("cuda"))
            print(f"Memory allocated [FWD]: {malloc_in_gb():.3f} GB")
            output.logits.mean().backward()
            print(f"Memory allocated [BWD]: {malloc_in_gb():.3f} GB")
        end.record()
        torch.cuda.synchronize()
        secs = start.elapsed_time(end) / 1000
        print(f"Elapsed time: {secs:.3f}\n\n")
        output, model = None, None
        free_memory()
    torch.cuda.memory._dump_snapshot(save_filename)
    print(f"Memory allocated [finish]: {malloc_in_gb():.3f} GB")
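The dumped pickle files can be inspected by dropping them onto the interactive viewer at https://pytorch.org/memory_viz. There is also an internal CLI helper for rendering standalone HTML timelines; since `_memory_viz` is a private API, the exact invocation may vary across PyTorch versions:

# e.g., from a shell, using one of the files this notebook writes:
#   python -m torch.cuda._memory_viz trace_plot profile_snapshots/base-training.pickle -o base-training.html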
# warmup
profile_model(partial(create_model, "DEBUG"), inference=True, save_filename=save_dir/"debug-inference.pickle")
Input Size:(1, 512)
Memory allocated [MODEL]: 0.051 GB
Memory allocated [FWD]: 0.125 GB
Elapsed time: 1.338
Input Size:(1, 1024)
Memory allocated [MODEL]: 0.059 GB
Memory allocated [FWD]: 0.193 GB
Elapsed time: 0.142
Input Size:(1, 2048)
Memory allocated [MODEL]: 0.059 GB
Memory allocated [FWD]: 0.324 GB
Elapsed time: 0.135
Input Size:(1, 3072)
Memory allocated [MODEL]: 0.059 GB
Memory allocated [FWD]: 0.425 GB
Elapsed time: 0.201
Memory allocated [finish]: 0.009 GB
"1B"), inference=True, save_filename=save_dir/"base-inference.pickle") profile_model(partial(create_model,
Input Size:(1, 512)
Memory allocated [MODEL]: 2.320 GB
Memory allocated [FWD]: 2.492 GB
Elapsed time: 15.401
Input Size:(1, 1024)
Memory allocated [MODEL]: 2.320 GB
Memory allocated [FWD]: 2.666 GB
Elapsed time: 14.997
Input Size:(1, 2048)
Memory allocated [MODEL]: 2.320 GB
Memory allocated [FWD]: 3.010 GB
Elapsed time: 14.370
Input Size:(1, 3072)
Memory allocated [MODEL]: 2.320 GB
Memory allocated [FWD]: 3.322 GB
Elapsed time: 14.218
Memory allocated [finish]: 0.009 GB
# (1, 4096) OOMs with a 16GB GPU
profile_model(partial(create_model, "1B"), inference=False, save_filename=save_dir/"base-training.pickle")
Input Size:(1, 512)
Memory allocated [MODEL]: 2.320 GB
Memory allocated [FWD]: 3.521 GB
Memory allocated [BWD]: 4.779 GB
Elapsed time: 13.765
Input Size:(1, 1024)
Memory allocated [MODEL]: 2.328 GB
Memory allocated [FWD]: 4.757 GB
Memory allocated [BWD]: 4.952 GB
Elapsed time: 13.277
Input Size:(1, 2048)
Memory allocated [MODEL]: 2.328 GB
Memory allocated [FWD]: 7.283 GB
Memory allocated [BWD]: 5.294 GB
Elapsed time: 13.706
Input Size:(1, 3072)
Memory allocated [MODEL]: 2.328 GB
Memory allocated [FWD]: 9.879 GB
Memory allocated [BWD]: 5.606 GB
Elapsed time: 14.203
Memory allocated [finish]: 0.017 GB
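The ~2.32 GB [MODEL] footprint matches a quick parameter count for this "1B" config (hidden 2048, 24 layers, intermediate 4096, Llama's default 32k vocab). A rough sanity check, ignoring the small RMSNorm weights:

hidden, layers, interm, vocab = 2048, 24, 4096, 32000
embed   = vocab * hidden              # input embeddings
attn    = 4 * hidden * hidden         # q, k, v, o projections
mlp     = 3 * hidden * interm         # gate, up, down projections
lm_head = vocab * hidden
params  = embed + layers * (attn + mlp) + lm_head
print(f"{params/1e9:.2f}B params -> {params*2/1e9:.2f} GB in bf16")  # ~1.14B -> ~2.28 GB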
LoRA
def create_lora_model(model_size="1B", gc_enabled=False):
    model_size_config = get_model_size_config(model_size)
    # build a Llama config with the size-specific values; weights are randomly initialized, nothing is downloaded.
    config = LlamaConfig()
    config.update(model_size_config)
    model = LlamaForCausalLM(config)
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
    )
    model = get_peft_model(model, peft_config)
    if gc_enabled: model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
    return model
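With r=8 the LoRA adapters add only a small fraction of trainable weights, which is why the training [BWD] numbers below stay close to the inference footprint: gradients are only kept for the adapters. PEFT can report the fraction directly; a quick check (exact counts depend on the peft version's default target modules):

model = create_lora_model("1B")
model.print_trainable_parameters()  # e.g. trainable params on the order of 1M, trainable% well under 1%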
"1B"), inference=True, save_filename=save_dir/"lora-inference.pickle") profile_model(partial(create_lora_model,
Input Size:(1, 512)
Memory allocated [MODEL]: 2.314 GB
Memory allocated [FWD]: 2.495 GB
Elapsed time: 17.398
Input Size:(1, 1024)
Memory allocated [MODEL]: 2.323 GB
Memory allocated [FWD]: 2.669 GB
Elapsed time: 15.746
Input Size:(1, 2048)
Memory allocated [MODEL]: 2.323 GB
Memory allocated [FWD]: 3.013 GB
Elapsed time: 15.481
Input Size:(1, 3072)
Memory allocated [MODEL]: 2.323 GB
Memory allocated [FWD]: 3.325 GB
Elapsed time: 15.432
Memory allocated [finish]: 0.009 GB
"1B"), inference=False, save_filename=save_dir/"lora-training.pickle") profile_model(partial(create_lora_model,
Input Size:(1, 512)
Memory allocated [MODEL]: 2.323 GB
Memory allocated [FWD]: 3.363 GB
Memory allocated [BWD]: 2.507 GB
Elapsed time: 16.125
Input Size:(1, 1024)
Memory allocated [MODEL]: 2.331 GB
Memory allocated [FWD]: 4.437 GB
Memory allocated [BWD]: 2.681 GB
Elapsed time: 15.417
Input Size:(1, 2048)
Memory allocated [MODEL]: 2.331 GB
Memory allocated [FWD]: 6.642 GB
Memory allocated [BWD]: 3.025 GB
Elapsed time: 15.374
Input Size:(1, 3072)
Memory allocated [MODEL]: 2.331 GB
Memory allocated [FWD]: 8.916 GB
Memory allocated [BWD]: 3.337 GB
Elapsed time: 15.821
Memory allocated [finish]: 0.017 GB
LoRA + Gradient Ckpt.
Using the default HF grad ckpt strategy, which wraps each individual decoder layer.
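Conceptually, checkpointing a decoder layer amounts to the wrapper sketched below: the layer's internal activations are not stored during forward and are recomputed from the block input during backward, trading compute for memory. This is only an illustration of the idea; HF wires this up internally when `gradient_checkpointing_enable` is called.

from torch.utils.checkpoint import checkpoint

def checkpointed_forward(layer, hidden_states, *args, **kwargs):
    # activations inside `layer` are recomputed in backward instead of being saved
    return checkpoint(layer, hidden_states, *args, use_reentrant=False, **kwargs)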
"1B", gc_enabled=True), inference=False, save_filename=save_dir/"lora-gc-training.pickle") profile_model(partial(create_lora_model,
Input Size:(1, 512)
Memory allocated [MODEL]: 2.331 GB
Memory allocated [FWD]: 2.466 GB
Memory allocated [BWD]: 2.406 GB
Elapsed time: 15.596
Input Size:(1, 1024)
Memory allocated [MODEL]: 2.331 GB
Memory allocated [FWD]: 2.594 GB
Memory allocated [BWD]: 2.479 GB
Elapsed time: 14.345
Input Size:(1, 2048)
Memory allocated [MODEL]: 2.331 GB
Memory allocated [FWD]: 2.845 GB
Memory allocated [BWD]: 2.622 GB
Elapsed time: 14.974
Input Size:(1, 3072)
Memory allocated [MODEL]: 2.331 GB
Memory allocated [FWD]: 3.091 GB
Memory allocated [BWD]: 2.733 GB
Elapsed time: 15.887
Memory allocated [finish]: 0.017 GB
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
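The warning is harmless here: the KV cache only matters for generation, and HF disables it automatically. It can also be silenced up front by turning the cache off on the config before enabling checkpointing, e.g.:

model.config.use_cache = False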
HQQ LoRA
from hqq.core.quantize import BaseQuantizeConfig, HQQLinear, HQQBackend
from hqq.models.hf.llama import LlamaHQQ
def replace_with_hqq_linear(
    model,
    modules_to_not_convert=None,
    current_key_name=None,
    quantization_config=None,
    has_been_replaced=False,
    quant_storage=torch.uint8,
    compute_dtype=torch.bfloat16,
    keep_trainable=False,
):
    """
    Private method that wraps the recursion for module replacement.

    Returns the converted model and a boolean that indicates if the conversion has been successful or not.
    """
    for name, module in model.named_children():
        if current_key_name is None:
            current_key_name = []
        current_key_name.append(name)

        if (isinstance(module, nn.Linear) or isinstance(module, Conv1D)) and name not in modules_to_not_convert:
            # Check if the current key is not in the `modules_to_not_convert`
            if not any(key in ".".join(current_key_name) for key in modules_to_not_convert):
                # with init_empty_weights():
                model._modules[name] = HQQLinear(module,
                                                 quantization_config,
                                                 del_orig=True,
                                                 compute_dtype=compute_dtype,
                                                 device_n=torch.cuda.current_device())
                has_been_replaced = True
                # Store the module class in case we need to transpose the weight later
                model._modules[name].source_cls = type(module)
                # Force requires grad to False to avoid unexpected errors
                if keep_trainable:
                    model._modules[name].requires_grad_(True)
                else:
                    model._modules[name].requires_grad_(False)
        if len(list(module.children())) > 0:
            _, has_been_replaced = replace_with_hqq_linear(
                module,
                modules_to_not_convert,
                current_key_name,
                quantization_config,
                has_been_replaced=has_been_replaced,
            )
        # Remove the last key for recursion
        current_key_name.pop(-1)
    return model, has_been_replaced
def create_qlora_model(model_size="1B", with_lora=True, gc_enabled=False, keep_trainable=False, backend=HQQBackend.ATEN):
= get_model_size_config(model_size)
model_size_config
# download model weights and config files.
= LlamaConfig()
config
config.update(model_size_config)= LlamaForCausalLM(config)
model
= BaseQuantizeConfig(nbits=4, group_size=64, quant_zero=True, quant_scale=False)
quant_config = replace_with_hqq_linear(model,
model, has_been_replaced =["lm_head"],
modules_to_not_convert=quant_config,
quantization_config=keep_trainable,
keep_trainable=torch.bfloat16,
quant_storage=torch.bfloat16)
compute_dtype
assert has_been_replaced
if with_lora:
= {'lora_type':'default',
base_lora_params 'r':8,
'lora_alpha':32,
'dropout':0.1,
'compute_dtype':torch.bfloat16,
'train_dtype':torch.bfloat16}
= {'self_attn.q_proj': base_lora_params,
lora_params 'self_attn.k_proj': base_lora_params,
'self_attn.v_proj': base_lora_params,
'self_attn.o_proj': base_lora_params,
'mlp.gate_proj' : base_lora_params,
'mlp.up_proj' : base_lora_params,
'mlp.down_proj' : base_lora_params}
=LlamaHQQ, verbose=True)
PeftUtils.add_lora(model, lora_params, base_classif gc_enabled: model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
HQQLinear.set_backend(backend)return model
# set_seed(42)
# model = create_qlora_model(model_size="DEBUG", with_lora=True,
#                            gc_enabled=False, keep_trainable=False, backend=HQQBackend.PYTORCH_BACKPROP_COMPILE)
# model.to(0).to(torch.bfloat16);
# x = torch.randint(0,100,(4, 128)).cuda()#.to(torch.bfloat16)
# o = model(x)
# loss = o.logits.mean()
# loss.backward()
# for n,p in model.named_parameters():
#     if p.requires_grad:
#         print(n, p.dtype, p.device, p.grad.norm().data)
"1B", backend=HQQBackend.ATEN), inference=True, save_filename=save_dir/"qlora-inference.pickle") profile_model(partial(create_qlora_model,
Input Size:(1, 512)
Memory allocated [MODEL]: 0.862 GB
Memory allocated [FWD]: 1.043 GB
Elapsed time: 66.540
Input Size:(1, 1024)
Memory allocated [MODEL]: 0.871 GB
Memory allocated [FWD]: 1.217 GB
Elapsed time: 65.790
Input Size:(1, 2048)
Memory allocated [MODEL]: 0.871 GB
Memory allocated [FWD]: 1.561 GB
Elapsed time: 65.778
Input Size:(1, 3072)
Memory allocated [MODEL]: 0.871 GB
Memory allocated [FWD]: 1.873 GB
Elapsed time: 65.310
Memory allocated [finish]: 0.009 GB
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 197.56it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 195.93it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 203.04it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 212.20it/s]
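The drop from ~2.32 GB to ~0.86 GB is roughly what 4-bit weights predict. A back-of-the-envelope check, treating the per-group scale/zero meta-data approximately (exact HQQ layouts differ slightly):

linear  = 24 * (4 * 2048 * 2048 + 3 * 2048 * 4096)  # all quantized linear weights
fp_side = 2 * 32000 * 2048 * 2                      # embeddings + lm_head kept in bf16
w4      = linear * 0.5                              # 4 bits per weight
meta    = (linear // 64) * 3                        # approx. scale + quantized zero per group of 64
print(f"{(w4 + meta + fp_side)/1e9:.2f} GB")        # ~0.81 GB vs. ~0.86 GB measured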
"1B", backend=HQQBackend.ATEN_BACKPROP), inference=False, save_filename=save_dir/"qlora-training.pickle") profile_model(partial(create_qlora_model,
Input Size:(1, 512)
Memory allocated [MODEL]: 0.871 GB
Memory allocated [FWD]: 2.563 GB
Memory allocated [BWD]: 1.065 GB
Elapsed time: 65.322
Input Size:(1, 1024)
Memory allocated [MODEL]: 0.879 GB
Memory allocated [FWD]: 4.289 GB
Memory allocated [BWD]: 1.238 GB
Elapsed time: 64.854
Input Size:(1, 2048)
Memory allocated [MODEL]: 0.879 GB
Memory allocated [FWD]: 7.798 GB
Memory allocated [BWD]: 1.582 GB
Elapsed time: 64.948
Input Size:(1, 3072)
Memory allocated [MODEL]: 0.879 GB
Memory allocated [FWD]: 11.376 GB
Memory allocated [BWD]: 1.895 GB
Elapsed time: 65.371
Memory allocated [finish]: 0.017 GB
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 217.77it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 208.45it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 207.33it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 207.45it/s]
QLORA + Gradient Ckpt.
Using the default HF grad ckpt strategy, which wraps each individual decoder layer.
"DEBUG", gc_enabled=True, backend=HQQBackend.PYTORCH_BACKPROP),
profile_model(partial(create_qlora_model, =False, save_filename=save_dir/"qlora-gc-training.pickle") inference
# for n,p in model.named_parameters():
#     print(n, p.name, p.requires_grad)
model = create_qlora_model("DEBUG", gc_enabled=True, backend=HQQBackend.PYTORCH_BACKPROP)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 307.93it/s]
"cuda", torch.bfloat16); model.to(
This is the correct timing, because it excludes model initialization, quantization, and the warmup forward pass.
for x in inputs:
    set_seed(42)
    model = create_qlora_model("1B", gc_enabled=True, backend=HQQBackend.ATEN_BACKPROP)
    model.to("cuda", torch.bfloat16);
    # warmup forward pass, excluded from the timed region below
    with torch.no_grad():
        model(inputs[0].to("cuda"))

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    torch.cuda.reset_peak_memory_stats()
    print(f"Memory allocated [MODEL]: {malloc_in_gb():.3f} GB")
    output = model(x.to("cuda"))
    print(f"Memory allocated [FWD]: {malloc_in_gb():.3f} GB")
    output.logits.mean().backward()
    print(f"Memory allocated [BWD]: {malloc_in_gb():.3f} GB")
    max_memory = torch.cuda.memory.max_memory_allocated()/1e9
    print(f"Max MemAlloc: {max_memory}")
    end.record()
    torch.cuda.synchronize()
    secs = start.elapsed_time(end) / 1000
    print(f"Elapsed time: {secs:.3f}\n\n")
    output, model = None, None
    free_memory()
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 193.32it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 196.59it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 197.02it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 138.58it/s]
Memory allocated [MODEL]: 0.964 GB
Memory allocated [FWD]: 1.092 GB
Memory allocated [BWD]: 1.043 GB
Max MemAlloc: 1.190423552
Elapsed time: 0.402
Memory allocated [MODEL]: 0.964 GB
Memory allocated [FWD]: 1.220 GB
Memory allocated [BWD]: 1.115 GB
Max MemAlloc: 1.417184256
Elapsed time: 0.401
Memory allocated [MODEL]: 0.964 GB
Memory allocated [FWD]: 1.471 GB
Memory allocated [BWD]: 1.258 GB
Max MemAlloc: 1.865462784
Elapsed time: 0.411
Memory allocated [MODEL]: 0.964 GB
Memory allocated [FWD]: 1.717 GB
Memory allocated [BWD]: 1.369 GB
Max MemAlloc: 2.307974144
Elapsed time: 0.500