The following is an example of the Slurm batch file that we used to validate SFT with Llama 2 7B:
#!/bin/bash
# Single-node supervised fine-tuning (SFT) of Llama 2 7B with the NeMo
# framework container, driven by Slurm. Edit the host paths in the srun
# call below to match your site layout.
#
# Parameters
#SBATCH --error=%j.err
#SBATCH --gpus-per-node=8
#SBATCH --job-name=singlenode-sft
#SBATCH --nodes=1
#SBATCH --nodelist=node015
#SBATCH --ntasks-per-node=8
#SBATCH --output=%j.out
#SBATCH --partition=slr-9680-hdr
#SBATCH --time=6-00:00:00

# Fail fast: abort on command failure, unset variables, or a failed
# pipeline stage, instead of launching srun after a broken setup step.
set -euo pipefail

# Dataset file lists as seen INSIDE the container (the host directory is
# bind-mounted to /dataset via --container-mounts below). The bracketed
# form is the NeMo/Hydra list syntax, not a shell construct.
TRAIN_DS="[/dataset/data/training.jsonl]"
VALID_DS="[/dataset/data/validation.jsonl]"
TEST_DS="[/dataset/data/test.jsonl]"
VALID_NAMES="[databricks-dolly-15k]"   # NOTE(review): currently unused below — confirm whether validation_ds.names should be set
CONCAT_SAMPLING_PROBS="[1.0]"

# Model parallelism: tensor-parallel degree 2, no pipeline parallelism.
TP_SIZE=2
PP_SIZE=1

# Base checkpoint (container path). Other checkpoints used during testing:
#   llama2-13b-bf16-tp4.nemo, llama2-7b-bf16-tp1.nemo
MODEL="/models/llama2-7b-bf16-tp1.nemo"

# NOTE(review): CUDA_LAUNCH_BLOCKING=1 serializes CUDA kernel launches.
# It is a debugging aid and slows training noticeably — confirm it is
# still needed before committing to a multi-day run.
export CUDA_LAUNCH_BLOCKING=1

module load docker/20.10.25

# Launch the SFT script inside the NeMo training container. Option
# arguments are quoted so expansions such as ${SLURM_JOBID} are safe;
# the inner double-quoted bash -c string is expanded on the host, which
# is intentional (TP_SIZE, MODEL, etc. are defined above, not in the
# container).
srun \
  --container-mounts="/powerscale-share/prem_sft/results/${SLURM_JOBID}/:/results,/dev/infiniband/:/dev/infiniband,/powerscale-share/llm/datasets/databricks-dolly-15k/:/dataset,/powerscale-share/llm/models/NeMo-models/llama2-NGC-v2.0/:/models" \
  --container-image="docker://nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.03" \
  bash -c "python /opt/NeMo/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py \
trainer.precision=bf16 \
trainer.devices=8 \
trainer.num_nodes=1 \
trainer.val_check_interval=0.5 \
trainer.max_steps=50 \
model.restore_from_path=${MODEL} \
model.micro_batch_size=1 \
model.global_batch_size=128 \
model.tensor_model_parallel_size=${TP_SIZE} \
model.pipeline_model_parallel_size=${PP_SIZE} \
model.megatron_amp_O2=True \
model.sequence_parallel=True \
model.activations_checkpoint_granularity=selective \
model.activations_checkpoint_method=uniform \
model.optim.name=distributed_fused_adam \
model.optim.lr=5e-6 \
model.answer_only_loss=True \
model.data.train_ds.file_names=${TRAIN_DS} \
model.data.validation_ds.file_names=${VALID_DS} \
model.data.test_ds.file_names=${TEST_DS} \
model.data.train_ds.concat_sampling_probabilities=${CONCAT_SAMPLING_PROBS} \
model.data.train_ds.max_seq_length=2048 \
model.data.validation_ds.max_seq_length=2048 \
model.data.train_ds.micro_batch_size=1 \
model.data.train_ds.global_batch_size=128 \
model.data.validation_ds.micro_batch_size=1 \
model.data.validation_ds.global_batch_size=128 \
model.data.test_ds.micro_batch_size=1 \
model.data.test_ds.global_batch_size=256 \
model.data.train_ds.num_workers=0 \
model.data.validation_ds.num_workers=0 \
model.data.test_ds.num_workers=0 \
model.data.validation_ds.metric.name=loss \
model.data.test_ds.metric.name=loss \
exp_manager.exp_dir=/results \
exp_manager.create_checkpoint_callback=True \
exp_manager.checkpoint_callback_params.save_nemo_on_train_end=True"