import itertools
import pandas as pd
import numpy as np
from transformers import AutoConfig, AutoModelForCausalLM
import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
import os
Experiments
gc = gspread.oauth()

models = [
    {"model_name":"meta-llama/Llama-2-7b-hf", "model_size":7},
    {"model_name":"meta-llama/Llama-2-13b-hf", "model_size":13},
    {"model_name":"codellama/CodeLlama-34b-hf", "model_size":34},
    {"model_name":"meta-llama/Llama-2-70b-hf", "model_size":70},
]
# for m in models:
#     cfg = AutoConfig.from_pretrained(m['model_name'])
#     m['config'] = cfg
seqlen = [{"seqlen":256}]
max_bs = [{"max_bs":None}]
bs = [{"bs":None}]
cpu_offloading = [{"cpu_offloading":False}, {"cpu_offloading":True}]
distrib_type = [{"distrib_type":"FSDP"}, {"distrib_type":"DDP"}]
ft_type = [{"ft_type":"LoRA"}, {"ft_type":"QLoRA"}]
# RTX 3090 is not available in cloud providers; the A5000 also has 24GB memory.
gpus = [{"gpu_model":"A5000", "num_gpus":2, "gpu_mem":24, "total_gpu_mem":48, "nvlink":"False"},
        {"gpu_model":"A100-40", "num_gpus":8, "gpu_mem":40, "total_gpu_mem":320, "nvlink":"True"}]
wandb = [{"wandb_link":None,
          "memory_peak":None,
          "memory_after_model_creation":None,
          "memory_after_model_wrap":None,
          "memory_before_forward":None,
          "memory_after_forward":None,
          "memory_before_backward":None,
          "memory_after_backward":None,
          "time_taken":None}]
grad_ckpt = [{"use_gradient_checkpointing":True}, {"use_gradient_checkpointing":False}]
iters = [models, seqlen, max_bs, bs, grad_ckpt, cpu_offloading, distrib_type, ft_type, gpus, wandb]
experiments = list(itertools.product(*iters))
len(experiments)
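As a quick sanity check, the grid size should equal the product of the option-list lengths: 4 models × 2 grad_ckpt × 2 cpu_offloading × 2 distrib_type × 2 ft_type × 2 gpus = 128 combinations before filtering. A minimal assertion (illustrative, using the iters list above):

# Sanity check: the grid has one entry per combination of options.
assert len(experiments) == np.prod([len(it) for it in iters])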
def flatten_list_of_dicts(l):
    final_d = {}
    for d in l:
        for k,v in d.items():
            if k in final_d:
                raise ValueError(f"Key {k} exists.")
            final_d[k] = v
    return final_d
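For example, merging a few of the per-option dicts defined above produces a single flat dict, and a duplicate key would raise (an illustrative call):

# Illustrative call: merge per-option dicts into one flat experiment dict.
flatten_list_of_dicts([{"model_name":"meta-llama/Llama-2-7b-hf", "model_size":7},
                       {"seqlen":256},
                       {"ft_type":"LoRA"}])
# -> {'model_name': 'meta-llama/Llama-2-7b-hf', 'model_size': 7, 'seqlen': 256, 'ft_type': 'LoRA'}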
experiments_flat = [flatten_list_of_dicts(exp) for exp in experiments]
df = pd.DataFrame(experiments_flat)
# exclude lora ddp
mask = ~((df['ft_type'] == 'LoRA') & (df['distrib_type'] == 'DDP'))
# no cpu-offloading with ddp
mask = np.logical_and(mask, ~((df['cpu_offloading'] == True) & (df['distrib_type'] == 'DDP')))

df = df[mask].reset_index(drop=True)
df.shape
df.head()
# !pip install gspread
# !pip install gspread-dataframe
url = "https://docs.google.com/spreadsheets/d/1JSQbnkwtqPgc-_wqI3LTCJI6jWCaWafubK0ontWR2_Y"
sheet = gc.open_by_url(url)
# this will overwrite the existing sheet!
# use other utils from gspread to add data to specific cells.
worksheet = sheet.get_worksheet_by_id(0)
set_with_dataframe(worksheet, df)
Modify Experiments
Flag experiments that exceed the theoretical memory limits, excluding activations.
Note: in the DDP script, cast all model params to bfloat16 except for the RoPE layers.
DDP requires all params, optimizer states, and activations to fit into a single GPU.
Compute the approximate memory requirement per GPU with FSDP full sharding, and consider the cases with and without CPU offloading.
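A rough sketch of that estimate (illustrative only; the helper name is mine): bf16 weights take about 2 bytes per parameter, so per-GPU weight memory is roughly model_size × 2 GB under DDP and model_size × 2 / num_gpus GB under FSDP full sharding; with CPU offloading the parameters live mostly in CPU RAM, which is presumably why those rows are not flagged below.

# Illustrative sketch: approximate per-GPU weight memory in GB for bf16 weights.
def approx_weight_mem_gb(model_size_b, num_gpus, distrib_type):
    full = model_size_b * 2        # ~2 bytes per parameter in bf16
    if distrib_type == "FSDP":
        return full / num_gpus     # full sharding splits the weights across GPUs
    return full                    # DDP keeps a full copy on every GPU

approx_weight_mem_gb(70, 8, "FSDP"), approx_weight_mem_gb(70, 1, "DDP")
# -> (17.5, 140)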
url = "https://docs.google.com/spreadsheets/d/1JSQbnkwtqPgc-_wqI3LTCJI6jWCaWafubK0ontWR2_Y"
sheet = gc.open_by_url(url)
worksheet = sheet.get_worksheet_by_id(0)
vals = worksheet.get_all_values()
df = pd.DataFrame(vals[1:], columns=vals[0])
df.shape
df.columns
df.head()
# activation memory per layer: https://arxiv.org/pdf/2205.05198.pdf
bs = 1
sl = 256
h = 4096
a = 32
(bs * sl * h * (34 + 5 * (a * sl / h))) / 1e9
# exclude optimizer states since lora updates a small fraction of weights
# exclude activations
oom_ignored = []
for row in df.itertuples():
    if row.cpu_offloading != 'TRUE':
        approx_mem_req = int(row.model_size) * 2 / (int(row.num_gpus) if row.distrib_type == 'FSDP' else 1)
        oom_ignored.append(approx_mem_req > int(row.total_gpu_mem))
    else:
        oom_ignored.append(False)
df['oom_ignored'] = oom_ignored
df['oom_ignored'].mean(), df['oom_ignored'].sum()
set_with_dataframe(worksheet, df)
Create Training Commands
sub_df = df.query("oom_ignored == 'FALSE' or not oom_ignored")
df.shape, sub_df.shape
small_gpu_commands = []
large_gpu_commands = []

for _, row in sub_df.iterrows():
    cmd_args = ["python train.py",
                "--batch_size 128",  # divide by 2 every retry
                "--num_epochs 1",
                "--dataset alpaca_sample",
                "--use_flash_attention",
                "--precision bf16_buffers_autocast",
                "--log_to wandb",
                ]
    if row.distrib_type == "DDP":
        cmd_args.append("--use_dpp")
    elif row.distrib_type == "FSDP":
        pass
    else:
        raise ValueError(f"Unknown distrib_type {row.distrib_type}")

    cmd_args.append(f"--model_name {row.model_name}")

    cmd_args.append(f"--context_length {row.seqlen}")

    if row.use_gradient_checkpointing == "TRUE":
        cmd_args.append("--use_gradient_checkpointing True")
    else:
        cmd_args.append("--use_gradient_checkpointing False")

    if row.cpu_offloading == "TRUE":
        cmd_args.append("--use_cpu_offload")

    if row.ft_type == "LoRA":
        cmd_args.append("--train_type lora")
    elif row.ft_type == "QLoRA":
        cmd_args.append("--train_type qlora")
    else:
        raise ValueError(f"Unknown ft_type {row.ft_type}")

    if row.gpu_model == "A100-40":
        large_gpu_commands.append(" ".join(cmd_args))
    elif row.gpu_model == "A5000":
        small_gpu_commands.append(" ".join(cmd_args))
    else:
        raise ValueError("Unknown gpu model.")

os.makedirs("../benchmarking", exist_ok=True)
with open("../benchmarking/small_gpu_benchmarking.sh", "w") as f:
    f.write("\n".join(small_gpu_commands))

with open("../benchmarking/large_gpu_benchmarking.sh", "w") as f:
    f.write("\n".join(large_gpu_commands))
Update Sheet with Results
import wandb
api = wandb.Api()
url = "https://docs.google.com/spreadsheets/d/1JSQbnkwtqPgc-_wqI3LTCJI6jWCaWafubK0ontWR2_Y"
sheet = gc.open_by_url(url)
empty_worksheet = sheet.get_worksheet_by_id(0)
filled_worksheet = sheet.get_worksheet_by_id(74399953)
vals = empty_worksheet.get_all_values()
df = pd.DataFrame(vals[1:], columns=vals[0])
df.shape
df.columns
wandb_project = "answerdotai/fsdp-benchmarking"

wandb_cols = ['memory_peak', 'memory_after_model_creation',
              'memory_after_model_wrap', 'memory_before_forward',
              'memory_after_forward', 'memory_after_backward',
              'time_taken']
empty_logs = pd.Series({c:None for c in wandb_cols})
wandb_logs = []
for row in df.itertuples():
    if row.wandb_link == "":
        wandb_logs.append(empty_logs)
    else:
        expid = row.wandb_link.split("runs/")[-1].split("/")[0].split("?")[0]
        print(row.wandb_link, expid)
        run = api.run(wandb_project + "/" + expid)
        history_df = run.history()
        existing_cols = list(set(history_df.columns).intersection(wandb_cols))
        wandb_logs.append(history_df[existing_cols].fillna(-1e30).max(axis=0))
wandb_logs_df = pd.concat(wandb_logs, axis=1).T
for c in wandb_logs_df.columns:
    if c.startswith("memory"):
        wandb_logs_df[c] = wandb_logs_df[c] / 1e9
df[wandb_logs_df.columns] = wandb_logs_df
df.head()
set_with_dataframe(filled_worksheet, df, 1, 1)