The following example shows the Slurm batch file that we used to validate LoRA fine-tuning of Llama 2 70B:
#!/bin/bash
# Parameters
#SBATCH --error=%j.err
#SBATCH --gpus-per-node=8
#SBATCH --job-name=peft-lora
#SBATCH --nodes=2
#SBATCH --nodelist=node0[12-13]
#SBATCH --ntasks-per-node=8
#SBATCH --output=%j.out
#SBATCH --partition=defq
#SBATCH --time=6-00:00:00
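# Dataset files (paths as mounted inside the container), run output location, and data sampling settings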
TRAIN_DS="[/datasets/training.jsonl]"
VALID_DS="[/datasets/validation.jsonl]"
TEST_DS="[/datasets/test.jsonl]"
VALID_NAMES="[databricks-dolly-15k]"
RESTORE_PATH="/home/user/helix_output/"
CONCAT_SAMPLING_PROBS="[1.0]"
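# Parallelism for the 70B model: tensor parallel across the 8 GPUs of a node, no pipeline parallelism
# (with 16 GPUs in total, this gives a data-parallel size of 2)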
TP_SIZE=8
PP_SIZE=1
MODEL="/models/llama2-70b-bf16.nemo"  # alternatives: "/models/llama2-7b-bf16-tp1.nemo", "/models/llama2-13b-bf16-tp4.nemo"
export HYDRA_FULL_ERROR=1
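# NCCL settings: select the InfiniBand HCAs to use and enable verbose NCCL logging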
export NCCL_IB_HCA=mlx5_0,mlx5_3,mlx5_10,mlx5_11,mlx5_4,mlx5_5,mlx5_6,mlx5_9
export NCCL_IBEXT_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_IGNORE_CPU_AFFINITY=1
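# Derive a job-unique master port from the last four digits of the Slurm job ID,
# and compute the total rank count (2 nodes x 8 tasks per node = 16)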
export MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOBID | tail -c 4))
export WORLD_SIZE=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE))
echo "WORLD_SIZE="$WORLD_SIZE
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR=$master_addr
echo "MASTER_ADDR="$MASTER_ADDR
module load docker
module load cuda-dcgm/3.1.3.1
module load cuda12.2/toolkit/12.2.1
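# Launch the NeMo framework training container on all ranks; --container-mounts maps the InfiniBand
# devices, base model, databricks-dolly-15k dataset, and results directories into the container, and
# the Hydra overrides below configure the LoRA PEFT run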
srun \
  --container-mounts=/dev/infiniband/:/dev/infiniband,/home/user/:/workspace,/powerscale-share/llm/models/NeMo-models/llama2-NGC-v2.0/:/models,/powerscale-share/llm/docker_tmp/tmp70b/:/tmp,/powerscale-share/llm/docker_tmp/var70b:/var,/powerscale-share/user/results/${SLURM_JOBID}:/results,/powerscale-share/llm/datasets/databricks-dolly-15k/data/:/datasets \
  --container-image="nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.03" \
  bash -c "export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 && nvidia-smi topo -m && nvidia-smi && python /opt/NeMo/examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py \
trainer.devices=8 \
trainer.num_nodes=2 \
trainer.precision=bf16 \
trainer.val_check_interval=30 \
trainer.max_steps=50 \
model.megatron_amp_O2=False \
++model.mcore_gpt=True \
exp_manager.create_wandb_logger=False \
exp_manager.resume_if_exists=True \
exp_manager.explicit_log_dir=/results \
exp_manager.resume_ignore_no_checkpoint=True \
exp_manager.create_checkpoint_callback=True \
exp_manager.checkpoint_callback_params.monitor=validation_loss \
exp_manager.checkpoint_callback_params.save_best_model=False \
exp_manager.checkpoint_callback_params.save_nemo_on_train_end=True \
model.tensor_model_parallel_size=${TP_SIZE} \
model.pipeline_model_parallel_size=${PP_SIZE} \
model.micro_batch_size=1 \
model.global_batch_size=8 \
model.restore_from_path=${MODEL} \
model.data.train_ds.num_workers=0 \
model.data.validation_ds.num_workers=0 \
model.data.test_ds.num_workers=0 \
model.data.train_ds.file_names=${TRAIN_DS} \
model.data.train_ds.concat_sampling_probabilities=${CONCAT_SAMPLING_PROBS} \
model.data.validation_ds.file_names=${VALID_DS} \
model.peft.peft_scheme='lora' \
model.data.train_ds.max_seq_length=2048 \
model.data.validation_ds.max_seq_length=2048"
# Optional overrides (disabled for this run):
# model.optim.lr=0.001
# model.peft.p_tuning.virtual_tokens=10
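To run the job, submit the file with sbatch and monitor it with the usual Slurm commands. A minimal sketch, assuming the script is saved as lora-llama2-70b.sbatch (the file name is illustrative); per the #SBATCH directives above, stdout and stderr are written to <jobid>.out and <jobid>.err:

sbatch lora-llama2-70b.sbatch
squeue -u $USER
tail -f <jobid>.out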