import bitsandbytes as bnb
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModelForCausalLM
from transformers.utils.quantization_config import BitsAndBytesConfig
from transformers.pytorch_utils import Conv1D
import transformers
from transformers import LlamaConfig, LlamaForCausalLM
from transformers.integrations.bitsandbytes import replace_with_bnb_linear
from transformers.models.llama.modeling_llama import LlamaDecoderLayer
from peft.tuners.lora.config import LoraConfig
from peft.mapping import get_peft_model
from peft.utils.peft_types import *
import os
import gc
import inspect
from accelerate.utils import set_seed
from functools import partial
from pathlib import Path
Base Model
= Path("profile_snapshots/")
save_dir =True) os.makedirs(save_dir, exist_ok
transformers.logging.set_verbosity_warning()
def malloc_in_gb():
return torch.cuda.memory_allocated()/1e9
def get_model_size_config(model_size):
    if model_size == "DEBUG":
        model_size_config = dict(hidden_size=128,
                                 num_hidden_layers=2,
                                 num_attention_heads=2,
                                 num_key_value_heads=2,
                                 intermediate_size=256)
    elif model_size == "60M":
        model_size_config = dict(hidden_size=512,
                                 num_hidden_layers=4,
                                 num_attention_heads=4,
                                 num_key_value_heads=4,
                                 intermediate_size=1024)
    elif model_size == "120M":
        model_size_config = dict(hidden_size=768,
                                 num_hidden_layers=12,
                                 num_attention_heads=12,
                                 num_key_value_heads=12,
                                 intermediate_size=1536)
    elif model_size == "290M":
        model_size_config = dict(hidden_size=1024,
                                 num_hidden_layers=12,
                                 num_attention_heads=16,
                                 num_key_value_heads=16,
                                 intermediate_size=4096)
    elif model_size == "1B":
        model_size_config = dict(hidden_size=2048,
                                 num_hidden_layers=24,
                                 num_attention_heads=16,
                                 num_key_value_heads=16,
                                 intermediate_size=4096)
    elif model_size == "7B":
        # empty dict: the default LlamaConfig already corresponds to the 7B model
        model_size_config = {}
    return model_size_config
def create_model(model_size="1B"):
    model_size_config = get_model_size_config(model_size)
    # build a Llama config for the requested size and initialize a model from it (no weights are downloaded)
    config = LlamaConfig()
    config.update(model_size_config)
    model = LlamaForCausalLM(config)
    return model
def free_memory():
    gc.collect()
    torch.cuda.empty_cache()

print(f"Memory allocated: {malloc_in_gb():.3f} GB")
Memory allocated: 0.000 GB
# create dummy inputs
model = create_model("DEBUG")
vocab_size = model.model.embed_tokens.weight.size(0)
inputs = [torch.randint(0, vocab_size, (1, sl)) for sl in [512, 1024, 2048, 3072]]
print(f"Memory allocated: {malloc_in_gb():.3f} GB")
Memory allocated: 0.000 GB
def profile_model(create_model_func, inference=False, save_filename="mem_profile.pickle"):
    """
    Profile CUDA memory usage and step time for a model over the dummy `inputs` defined above.

    References:
    https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html#demonstrating-speedups
    https://pytorch.org/docs/stable/torch_cuda_memory.html
    https://medium.com/pytorch/how-activation-checkpointing-enables-scaling-up-training-deep-learning-models-7a93ae01ff2d
    https://pytorch.org/tutorials/intermediate/autograd_saved_tensors_hooks_tutorial.html
    """
    set_seed(42)
    # record allocator history so a snapshot can be dumped at the end and inspected later
    torch.cuda.memory._record_memory_history()
    for x in inputs:
        print(f"Input Size:{tuple(x.size())}")
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        if inference:
            with torch.no_grad():
                model = create_model_func()
                model.to("cuda", torch.bfloat16)
                print(f"Memory allocated [MODEL]: {malloc_in_gb():.3f} GB")
                output = model(x.to("cuda"))
                print(f"Memory allocated [FWD]: {malloc_in_gb():.3f} GB")
        else:
            model = create_model_func()
            model.to("cuda", torch.bfloat16)
            print(f"Memory allocated [MODEL): {malloc_in_gb():.3f} GB")
            output = model(x.to("cuda"))
            print(f"Memory allocated [FWD]: {malloc_in_gb():.3f} GB")
            output.logits.mean().backward()
            print(f"Memory allocated [BWD]: {malloc_in_gb():.3f} GB")
        end.record()
        torch.cuda.synchronize()
        secs = start.elapsed_time(end) / 1000
        print(f"Elapsed time: {secs:.3f}\n\n")
        output, model = None, None
        free_memory()
    torch.cuda.memory._dump_snapshot(save_filename)
    print(f"Memory allocated [finish]: {malloc_in_gb():.3f} GB")
# warmup
"DEBUG"), inference=True, save_filename=save_dir/"debug-inference.pickle") profile_model(partial(create_model,
Input Size:(1, 512)
Memory allocated [MODEL]: 0.018 GB
Memory allocated [FWD]: 0.093 GB
Elapsed time: 0.562
Input Size:(1, 1024)
Memory allocated [MODEL]: 0.027 GB
Memory allocated [FWD]: 0.160 GB
Elapsed time: 0.111
Input Size:(1, 2048)
Memory allocated [MODEL]: 0.027 GB
Memory allocated [FWD]: 0.291 GB
Elapsed time: 0.096
Input Size:(1, 3072)
Memory allocated [MODEL]: 0.027 GB
Memory allocated [FWD]: 0.425 GB
Elapsed time: 0.104
Memory allocated [finish]: 0.009 GB
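Each call to profile_model also dumps an allocator snapshot via `torch.cuda.memory._dump_snapshot`. These pickles can be inspected offline, either by dragging them into https://pytorch.org/memory_viz or by rendering an HTML timeline locally. A minimal sketch, assuming PyTorch's private `torch.cuda._memory_viz` helper is available in your build (its API is not stable across releases):

import pickle
from torch.cuda import _memory_viz  # private module; assumed available, may change between releases

with open(save_dir/"debug-inference.pickle", "rb") as f:
    snapshot = pickle.load(f)

# trace_plot returns a self-contained HTML page with the allocation timeline
html = _memory_viz.trace_plot(snapshot)
(save_dir/"debug-inference.html").write_text(html)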
"1B"), inference=True, save_filename=save_dir/"base-inference.pickle") profile_model(partial(create_model,
Input Size:(1, 512)
Memory allocated [MODEL]: 2.311 GB
Memory allocated [FWD]: 2.478 GB
Elapsed time: 12.858
Input Size:(1, 1024)
Memory allocated [MODEL]: 2.311 GB
Memory allocated [FWD]: 2.645 GB
Elapsed time: 12.719
Input Size:(1, 2048)
Memory allocated [MODEL]: 2.311 GB
Memory allocated [FWD]: 2.976 GB
Elapsed time: 12.735
Input Size:(1, 3072)
Memory allocated [MODEL]: 2.311 GB
Memory allocated [FWD]: 3.322 GB
Elapsed time: 12.682
Memory allocated [finish]: 0.009 GB
# (1, 4096) OOMs with a 16GB GPU
"1B"), inference=False, save_filename=save_dir/"base-training.pickle") profile_model(partial(create_model,
Input Size:(1, 512)
Memory allocated [MODEL): 2.311 GB
Memory allocated [FWD]: 3.605 GB
Memory allocated [BWD]: 4.764 GB
Elapsed time: 11.823
Input Size:(1, 1024)
Memory allocated [MODEL): 2.320 GB
Memory allocated [FWD]: 4.907 GB
Memory allocated [BWD]: 4.930 GB
Elapsed time: 12.106
Input Size:(1, 2048)
Memory allocated [MODEL): 2.320 GB
Memory allocated [FWD]: 7.493 GB
Memory allocated [BWD]: 5.260 GB
Elapsed time: 12.611
Input Size:(1, 3072)
Memory allocated [MODEL): 2.320 GB
Memory allocated [FWD]: 10.093 GB
Memory allocated [BWD]: 5.606 GB
Elapsed time: 13.033
Memory allocated [finish]: 0.017 GB
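The flat ~2.3 GB [MODEL] number is simply the bf16 parameter footprint, while the [FWD] growth with sequence length comes from stored activations. A quick sanity check of the parameter part (a rough sketch; it only counts parameter bytes and ignores the CUDA context and allocator overhead):

model = create_model("1B")
n_params = sum(p.numel() for p in model.parameters())
# bf16 stores 2 bytes per parameter
print(f"{n_params/1e9:.2f}B params -> ~{2*n_params/1e9:.2f} GB in bf16")
# ~1.14B params -> ~2.27 GB, in line with the ~2.31 GB reported above
model = None
free_memory()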
LoRA
def create_lora_model(model_size="1B", gc_enabled=False):
    model_size_config = get_model_size_config(model_size)
    # build a Llama config for the requested size and initialize a model from it (no weights are downloaded)
    config = LlamaConfig()
    config.update(model_size_config)
    model = LlamaForCausalLM(config)
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
    )
    model = get_peft_model(model, peft_config)
    if gc_enabled:
        model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
    return model
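Before profiling it is worth confirming how few parameters the adapters actually add; PEFT exposes a helper for exactly this. A small check (the exact counts depend on which modules PEFT targets by default for Llama):

lora_model = create_lora_model("1B")
# prints trainable vs. total parameter counts; only a small fraction should be trainable
lora_model.print_trainable_parameters()
lora_model = None
free_memory()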
"1B"), inference=True, save_filename=save_dir/"lora-inference.pickle") profile_model(partial(create_lora_model,
Input Size:(1, 512)
Memory allocated [MODEL]: 2.323 GB
Memory allocated [FWD]: 2.489 GB
Elapsed time: 12.622
Input Size:(1, 1024)
Memory allocated [MODEL]: 2.323 GB
Memory allocated [FWD]: 2.657 GB
Elapsed time: 12.293
Input Size:(1, 2048)
Memory allocated [MODEL]: 2.323 GB
Memory allocated [FWD]: 2.988 GB
Elapsed time: 12.341
Input Size:(1, 3072)
Memory allocated [MODEL]: 2.323 GB
Memory allocated [FWD]: 3.334 GB
Elapsed time: 12.339
Memory allocated [finish]: 0.017 GB
"1B"), inference=False, save_filename=save_dir/"lora-training.pickle") profile_model(partial(create_lora_model,
Input Size:(1, 512)
Memory allocated [MODEL): 2.323 GB
Memory allocated [FWD]: 3.451 GB
Memory allocated [BWD]: 2.492 GB
Elapsed time: 11.359
Input Size:(1, 1024)
Memory allocated [MODEL): 2.323 GB
Memory allocated [FWD]: 4.580 GB
Memory allocated [BWD]: 2.660 GB
Elapsed time: 11.946
Input Size:(1, 2048)
Memory allocated [MODEL): 2.323 GB
Memory allocated [FWD]: 6.835 GB
Memory allocated [BWD]: 2.991 GB
Elapsed time: 12.710
Input Size:(1, 3072)
Memory allocated [MODEL): 2.323 GB
Memory allocated [FWD]: 9.105 GB
Memory allocated [BWD]: 3.337 GB
Elapsed time: 13.298
Memory allocated [finish]: 0.017 GB
LoRA + Gradient Ckpt.
Using the default HF gradient checkpointing strategy, which wraps each individual decoder layer.
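Conceptually, this routes every `LlamaDecoderLayer` call through `torch.utils.checkpoint`, so only the layer inputs are kept and the intermediate activations are recomputed during backward. A simplified sketch of that per-layer wrapping (`run_layer_with_checkpointing` is an illustrative name, not the HF implementation, which forwards more arguments):

from torch.utils.checkpoint import checkpoint

def run_layer_with_checkpointing(decoder_layer: LlamaDecoderLayer, hidden_states, **kwargs):
    # Save only the inputs; recompute the layer's activations when backward needs them.
    return checkpoint(decoder_layer, hidden_states, use_reentrant=False, **kwargs)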
"1B", gc_enabled=True), inference=False, save_filename=save_dir/"lora-gc-training.pickle") profile_model(partial(create_lora_model,
Input Size:(1, 512)
Memory allocated [MODEL): 2.315 GB
Memory allocated [FWD]: 2.439 GB
Memory allocated [BWD]: 2.392 GB
Elapsed time: 11.923
Input Size:(1, 1024)
Memory allocated [MODEL): 2.323 GB
Memory allocated [FWD]: 2.573 GB
Memory allocated [BWD]: 2.458 GB
Elapsed time: 12.374
Input Size:(1, 2048)
Memory allocated [MODEL): 2.323 GB
Memory allocated [FWD]: 2.820 GB
Memory allocated [BWD]: 2.588 GB
Elapsed time: 12.543
Input Size:(1, 3072)
Memory allocated [MODEL): 2.323 GB
Memory allocated [FWD]: 3.082 GB
Memory allocated [BWD]: 2.733 GB
Elapsed time: 13.120
Memory allocated [finish]: 0.017 GB
QLoRA
def replace_with_bnb_4bit_linear(
    model,
    modules_to_not_convert=None,
    current_key_name=None,
    quantization_config=None,
    has_been_replaced=False,
    quant_storage=torch.uint8,
    keep_trainable=False,
):
    """
    Private method that wraps the recursion for module replacement.

    Returns the converted model and a boolean that indicates if the conversion has been successful or not.
    """
    for name, module in model.named_children():
        if current_key_name is None:
            current_key_name = []
        current_key_name.append(name)

        if (isinstance(module, nn.Linear) or isinstance(module, Conv1D)) and name not in modules_to_not_convert:
            # Check if the current key is not in the `modules_to_not_convert`
            if not any(key in ".".join(current_key_name) for key in modules_to_not_convert):
                # with init_empty_weights():
                if isinstance(module, Conv1D):
                    in_features, out_features = module.weight.shape
                else:
                    in_features = module.in_features
                    out_features = module.out_features

                model._modules[name] = bnb.nn.Linear4bit(
                    in_features,
                    out_features,
                    module.bias is not None,
                    quantization_config.bnb_4bit_compute_dtype,
                    compress_statistics=quantization_config.bnb_4bit_use_double_quant,
                    quant_type=quantization_config.bnb_4bit_quant_type,
                    quant_storage=quant_storage,
                )
                has_been_replaced = True
                # Store the module class in case we need to transpose the weight later
                model._modules[name].source_cls = type(module)
                # Force requires grad to False to avoid unexpected errors
                if keep_trainable:
                    model._modules[name].requires_grad_(True)
                else:
                    model._modules[name].requires_grad_(False)
        if len(list(module.children())) > 0:
            _, has_been_replaced = replace_with_bnb_4bit_linear(
                module,
                modules_to_not_convert,
                current_key_name,
                quantization_config,
                has_been_replaced=has_been_replaced,
                quant_storage=quant_storage,
                keep_trainable=keep_trainable,
            )
        # Remove the last key for recursion
        current_key_name.pop(-1)
    return model, has_been_replaced
def create_qlora_model(model_size="1B", with_lora=True, gc_enabled=False, keep_trainable=False):
    model_size_config = get_model_size_config(model_size)
    # build a Llama config for the requested size and initialize a model from it (no weights are downloaded)
    config = LlamaConfig()
    config.update(model_size_config)
    model = LlamaForCausalLM(config)
    qconfig = BitsAndBytesConfig(load_in_4bit=True,
                                 bnb_4bit_quant_type="nf4",
                                 bnb_4bit_use_double_quant=False,
                                 bnb_4bit_compute_dtype=torch.bfloat16)
    model, has_been_replaced = replace_with_bnb_4bit_linear(model,
                                                            modules_to_not_convert=["lm_head"],
                                                            quantization_config=qconfig,
                                                            keep_trainable=keep_trainable,
                                                            quant_storage=torch.bfloat16)
    assert has_been_replaced
    if with_lora:
        peft_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
        )
        model = get_peft_model(model, peft_config)
    if gc_enabled:
        model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
    return model
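The custom replace_with_bnb_4bit_linear above is essentially the `replace_with_bnb_linear` helper imported from `transformers.integrations.bitsandbytes` at the top of this notebook, extended with a `quant_storage` dtype and a `keep_trainable` switch. When neither is needed, the stock helper should produce an equivalent 4-bit replacement; a sketch (`create_qlora_model_hf` is an illustrative name):

def create_qlora_model_hf(model_size="1B"):
    config = LlamaConfig()
    config.update(get_model_size_config(model_size))
    model = LlamaForCausalLM(config)
    qconfig = BitsAndBytesConfig(load_in_4bit=True,
                                 bnb_4bit_quant_type="nf4",
                                 bnb_4bit_compute_dtype=torch.bfloat16)
    # stock HF replacement: same nn.Linear -> bnb.nn.Linear4bit swap, minus the extra knobs
    return replace_with_bnb_linear(model, modules_to_not_convert=["lm_head"], quantization_config=qconfig)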
"1B"), inference=True, save_filename=save_dir/"qlora-inference.pickle") profile_model(partial(create_qlora_model,
Input Size:(1, 512)
Memory allocated [MODEL]: 0.859 GB
Memory allocated [FWD]: 1.034 GB
Elapsed time: 19.783
Input Size:(1, 1024)
Memory allocated [MODEL]: 0.868 GB
Memory allocated [FWD]: 1.201 GB
Elapsed time: 17.461
Input Size:(1, 2048)
Memory allocated [MODEL]: 0.868 GB
Memory allocated [FWD]: 1.532 GB
Elapsed time: 17.779
Input Size:(1, 3072)
Memory allocated [MODEL]: 0.868 GB
Memory allocated [FWD]: 1.878 GB
Elapsed time: 17.819
Memory allocated [finish]: 0.009 GB
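The drop from ~2.31 GB to ~0.86 GB is what you would expect from storing the linear weights in 4 bits while keeping the embeddings and `lm_head` in bf16. A back-of-the-envelope check for the "1B" config (approximate; it assumes bitsandbytes' default NF4 block size of 64 with one fp32 absmax per block, since double quantization is disabled here):

hidden, inter, layers, vocab = 2048, 4096, 24, 32000
linear_params = layers * (4 * hidden * hidden + 3 * hidden * inter)  # q,k,v,o + gate,up,down projections
bf16_params = 2 * vocab * hidden                                     # embed_tokens + lm_head stay in bf16
absmax_bytes = linear_params / 64 * 4                                # assumed: fp32 absmax per 64-weight block
total_gb = (0.5 * linear_params + 2 * bf16_params + absmax_bytes) / 1e9
print(f"~{total_gb:.2f} GB")  # roughly 0.83 GB, close to the ~0.86 GB measured above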
"1B"), inference=False, save_filename=save_dir/"qlora-training.pickle") profile_model(partial(create_qlora_model,
Input Size:(1, 512)
Memory allocated [MODEL): 0.868 GB
Memory allocated [FWD]: 2.195 GB
Memory allocated [BWD]: 1.295 GB
Elapsed time: 17.303
Input Size:(1, 1024)
Memory allocated [MODEL): 0.876 GB
Memory allocated [FWD]: 3.532 GB
Memory allocated [BWD]: 1.712 GB
Elapsed time: 17.051
Input Size:(1, 2048)
Memory allocated [MODEL): 0.876 GB
Memory allocated [FWD]: 6.185 GB
Memory allocated [BWD]: 2.542 GB
Elapsed time: 17.963
Input Size:(1, 3072)
Memory allocated [MODEL): 0.876 GB
Memory allocated [FWD]: 8.853 GB
Memory allocated [BWD]: 3.387 GB
Elapsed time: 18.167
Memory allocated [finish]: 0.017 GB
QLoRA + Gradient Ckpt.
Using the default HF gradient checkpointing strategy, which wraps each individual decoder layer.
"1B", gc_enabled=True), inference=False, save_filename=save_dir/"qlora-gc-training.pickle") profile_model(partial(create_qlora_model,
Input Size:(1, 512)
Memory allocated [MODEL): 0.876 GB
Memory allocated [FWD]: 1.250 GB
Memory allocated [BWD]: 1.194 GB
Elapsed time: 17.265
Input Size:(1, 1024)
Memory allocated [MODEL): 0.876 GB
Memory allocated [FWD]: 1.625 GB
Memory allocated [BWD]: 1.511 GB
Elapsed time: 16.252
Input Size:(1, 2048)
Memory allocated [MODEL): 0.876 GB
Memory allocated [FWD]: 2.371 GB
Memory allocated [BWD]: 2.140 GB
Elapsed time: 17.468
Input Size:(1, 3072)
Memory allocated [MODEL): 0.876 GB
Memory allocated [FWD]: 3.133 GB
Memory allocated [BWD]: 2.783 GB
Elapsed time: 18.704
Memory allocated [finish]: 0.017 GB
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
# Re-create the QLoRA + gradient checkpointing setup, warm up with one forward pass,
# then measure per-step peak memory and wall-clock time without the memory-history recorder.
for x in inputs:
    set_seed(42)
    model = create_qlora_model("1B", gc_enabled=True)
    model.to("cuda", torch.bfloat16)
    with torch.no_grad():
        model(inputs[0].to("cuda"))

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    torch.cuda.reset_peak_memory_stats()
    print(f"Memory allocated [MODEL): {malloc_in_gb():.3f} GB")
    output = model(x.to("cuda"))
    print(f"Memory allocated [FWD]: {malloc_in_gb():.3f} GB")
    output.logits.mean().backward()
    print(f"Memory allocated [BWD]: {malloc_in_gb():.3f} GB")
    max_memory = torch.cuda.memory.max_memory_allocated()/1e9
    print(f"Max MemAlloc: {max_memory}")
    end.record()
    torch.cuda.synchronize()
    secs = start.elapsed_time(end) / 1000
    print(f"Elapsed time: {secs:.3f}\n\n")
    output, model = None, None
    free_memory()
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
Memory allocated [MODEL): 0.882 GB
Memory allocated [FWD]: 1.260 GB
Memory allocated [BWD]: 1.210 GB
Max MemAlloc: 1.360229376
Elapsed time: 0.195
Memory allocated [MODEL): 0.891 GB
Memory allocated [FWD]: 1.646 GB
Memory allocated [BWD]: 1.532 GB
Max MemAlloc: 1.844102144
Elapsed time: 0.194
Memory allocated [MODEL): 0.891 GB
Memory allocated [FWD]: 2.397 GB
Memory allocated [BWD]: 2.174 GB
Max MemAlloc: 2.791502848
Elapsed time: 0.231
Memory allocated [MODEL): 0.891 GB
Memory allocated [FWD]: 3.142 GB
Memory allocated [BWD]: 2.784 GB
Max MemAlloc: 3.733136384
Elapsed time: 0.417