diff --git a/custom_litgpt_dataloader/__init__.py b/custom_litgpt_dataloader/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/custom_litgpt_dataloader/data_util.py b/custom_litgpt_dataloader/data_util.py
new file mode 100644
index 0000000000..36da5ae70c
--- /dev/null
+++ b/custom_litgpt_dataloader/data_util.py
@@ -0,0 +1,120 @@
+from lit_gpt.packed_dataset import CombinedDataset, PackedDataset
+from lit_gpt.utils import CycleIterator
+from torch.utils.data import DataLoader
+from pathlib import Path
+from axonn import axonn as ax
+import torch.distributed as dist
+from typing import Tuple, Union, Optional
+
+# (dataset file prefix, sampling weight); weights are normalised in create_dataloader
+data_config = [
+    ("train_slimpajama", 69.3584),
+    ("train_starcoder", 30.6),
+    # 0.693584, 0.306416)
+    # ("c4", 15.0),
+    # ("cc", 67.0),
+    # ("github", 4.5),
+    # ("stackexchange", 2.0),
+    # ("wikipedia", 4.5),
+]
+
+
+def create_dataloader(
+    batch_size: int, block_size: int, data_dir: Path, shuffle: bool = True, seed: int = 12345
+) -> DataLoader:
+    """Build one DataLoader that mixes every dataset listed in ``data_config``.
+
+    One PackedDataset is created per prefix from the files in ``data_dir`` whose
+    names start with that prefix; AxoNN's ``G_data`` and ``data_parallel_rank``
+    are passed as ``num_processes`` / ``process_rank``. The datasets are then
+    combined using the ``data_config`` weights normalised to sum to one.
+
+    Raises:
+        FileNotFoundError: if no file in ``data_dir`` matches a prefix.
+        RuntimeError: if ``data_config`` yields no dataset at all.
+    """
+    datasets = []
+    for prefix, _ in data_config:
+        filenames = list(data_dir.glob(f"{prefix}*"))
+        if not filenames:
+            raise FileNotFoundError(
+                f"No files found at {data_dir} with prefix {prefix}. Did you forget to run `prepare_redpajama.py`?"
+            )
+        dataset = PackedDataset(
+            filenames,
+            n_chunks=4,
+            block_size=block_size,
+            shuffle=shuffle,
+            seed=seed,
+            num_processes=ax.config.G_data,
+            process_rank=ax.config.data_parallel_rank,
+        )
+        datasets.append(dataset)
+
+    if not datasets:
+        raise RuntimeError(
+            f"No data found at {data_dir}. Make sure you ran prepare_redpajama.py to create the dataset."
+        )
+
+    weights = [weight for _, weight in data_config]
+    sum_weights = sum(weights)
+    weights = [el / sum_weights for el in weights]
+
+    # NOTE(review): an earlier comment here said different seeds matter so that each
+    # batch has tokens from all data mixtures, yet the same `seed` is passed to
+    # every PackedDataset and to CombinedDataset - confirm this is intended.
+    combined_dataset = CombinedDataset(datasets=datasets, seed=seed, weights=weights)
+
+    return DataLoader(combined_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)
+
+
+def create_dataloaders(
+    batch_size: int,
+    block_size: int,
+    train_data_dir: str,
+    val_data_dir: Optional[str],
+    seed: int = 12345, #this seed is independent of megatron's seeds
+) -> Tuple[CycleIterator, Optional[CycleIterator]]:
+    """Return (train, validation) dataloaders, each wrapped in CycleIterator.
+
+    The validation entry is None when ``val_data_dir`` is None or empty.
+    """
+    # Increase by one because we need the next word as well
+    effective_block_size = block_size + 1
+    train_dataloader = create_dataloader(
+        batch_size=batch_size,
+        block_size=effective_block_size,
+        data_dir=Path(train_data_dir),
+        shuffle=True,
+        seed=seed,
+    )
+    # Test the raw argument: a Path object is always truthy and Path(None) raises,
+    # so checking after conversion could never select the "no validation" case.
+    if not val_data_dir:
+        return CycleIterator(train_dataloader), None
+    val_dataloader = create_dataloader(
+        batch_size=batch_size,
+        block_size=effective_block_size,
+        data_dir=Path(val_data_dir),
+        shuffle=False,
+        seed=seed,
+    )
+    return CycleIterator(train_dataloader), CycleIterator(val_dataloader)
+
+
+if __name__ == "__main__":
+    import sys
+
+    # Smoke test: python data_util.py TRAIN_DATA_DIR [VAL_DATA_DIR]
+    if len(sys.argv) < 2:
+        sys.exit(f"usage: {sys.argv[0]} TRAIN_DATA_DIR [VAL_DATA_DIR]")
+    train_dir = sys.argv[1]
+    val_dir = sys.argv[2] if len(sys.argv) > 2 else train_dir
+    ax.init(G_inter=1, G_data=1, G_intra_r=8)
+    train_loader, val_loader = create_dataloaders(
+        batch_size=32,
+        block_size=1024, #sequence length
+        train_data_dir=train_dir,
+        val_data_dir=val_dir,
+    )
+    data = next(train_loader)
+    print(dist.get_rank(), ":", data.view(-1)[:5])
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000000..073b601ea6
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,40 @@
+# How to setup on frontier
+
+## Installing all dependencies
+Note that this is a python virtual environment based setup
+You might need to change this a bit for conda
+
+Also this assumes that you are starting from scratch and have no venv/conda
+environment enabled.
+
+We are going to install everything on scratch.
+
+```
+cd /lustre/orion/scratch/$(whoami)/csc569/
+bash install_everything_on_frontier.sh
+```
+
+This should work, let Siddharth know if it doesn't
+
+## Training TinyLLaMA
+First checkout the tiny-llama branch of Megatron-AxoNN.
+Then open `examples/run_axonn_amd_tinyllama.sh`, and change the following
+
+```
+# These are the two things you need to change as per your setup
+# 1. Make LD_LIBRARY_PATH point to wherever your plugin is installed
+# this enables the slingshot-11 plugin for RCCL (crucial for inter-node bw)
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/path/to/aws-ofi-rccl/build/lib"
+# 2. Make PYTHONPATH point to your local clone of litgpt
+export PYTHONPATH="$PYTHONPATH:/path/to/lit-gpt-dev"
+```
+
+Now you are ready to train.
+To launch on 16 nodes (128 GPUs) for 2 hours:
+```
+## checkout the tiny-llama branch
+sbatch -N 16 -o /path/to/output/file -t 02:00:00 examples/run_axonn_amd_tinyllama.sh
+```
+
+
+
diff --git a/examples/get_rank_from_slurm.sh b/examples/get_rank_from_slurm.sh
new file mode 100755
index 0000000000..881bdc4a0e
--- /dev/null
+++ b/examples/get_rank_from_slurm.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# wrapper script: export RANK from SLURM_PROCID, then exec the wrapped command with its arguments intact
+export RANK=${SLURM_PROCID}
+exec "$@"
diff --git a/examples/install_everything_on_frontier.sh b/examples/install_everything_on_frontier.sh
new file mode 100644
index 0000000000..ac7359d65e
--- /dev/null
+++ b/examples/install_everything_on_frontier.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+# Setup Virtual Environment
+echo "Setting up Virtual Environment"
+module load cray-python
+python -m venv ./my-venv --system-site-packages
+cd my-venv
+. bin/activate
+
+# PyTorch
+echo "Installing PyTorch"
+pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.6
+
+module load amd-mixed/5.6.0
+module load PrgEnv-cray
+
+# mpi4py
+echo "Installing mpi4py"
+module load craype-accel-amd-gfx90a
+export MPICH_GPU_SUPPORT_ENABLED=1
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CRAY_MPICH_ROOTDIR}/gtl/lib"
+echo ${LD_LIBRARY_PATH}
+
+MPICC=CC python -m pip install --ignore-installed --no-cache-dir mpi4py
+
+# Flash Attention (setup.py is patched non-interactively from c++20 to c++17)
+echo "Installing Flash Attention"
+git clone https://github.com/ROCmSoftwarePlatform/flash-attention
+cd flash-attention
+sed -i 's/c++20/c++17/g' setup.py
+CC=cc CXX=CC PYTORCH_ROCM_ARCH='gfx90a' GPU_ARCHS='gfx90a' pip install -v .
+
+# Apex
+echo "Installing Apex"
+cd ..
+git clone https://github.com/ROCmSoftwarePlatform/apex
+cd apex
+git checkout release/1.1.0
+CC=cc CXX=CC PYTORCH_ROCM_ARCH='gfx90a' GPU_ARCHS='gfx90a' python setup.py install --cpp_ext --cuda_ext
+
+# RCCL Plugin
+echo "Installing RCCL Plugin"
+cd ..
+git clone https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl
+cd aws-ofi-rccl
+module load libtool
+./autogen.sh
+CC=cc CXX=CC ./configure --with-libfabric=/opt/cray/libfabric/1.15.0.0 --with-hip=/opt/rocm-5.6.0/ --with-rccl="$(dirname "$(pwd)")"/lib/python3.9/site-packages/torch/lib/ --prefix="$(dirname "$(pwd)")"/aws-ofi-rccl/build/
+CC=cc CXX=CC make -j install
+
+cd ..
+
+# AxoNN
+echo "Installing AxoNN"
+git clone https://github.com/axonn-ai/axonn.git
+cd axonn
+pip install -e .
+
+cd ..
+
+# Megatron-AxoNN
+echo "Installing Megatron-AxoNN"
+git clone https://github.com/axonn-ai/Megatron-AxoNN.git
+
+pip install regex
+
+echo "Done!"
+
diff --git a/examples/run_axonn_amd.sh b/examples/run_axonn_amd.sh
deleted file mode 100755
index ec3bf26923..0000000000
--- a/examples/run_axonn_amd.sh
+++ /dev/null
@@ -1,136 +0,0 @@
-#!/bin/bash
-
-# Runs the "345M" parameter model
-
-module load cray-python
-. 
/lustre/orion/scratch/ssingh37/csc547/venv_axonn_pt_2.1/bin/activate -module load amd-mixed/5.6.0 #this should match with the rocm version your pytorch uses - -## these lines enable CUDA aware MPI -module load craype-accel-amd-gfx90a -export MPICH_GPU_SUPPORT_ENABLED=0 -export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CRAY_MPICH_ROOTDIR}/gtl/lib" - -## this enables the slingshot-11 plugin for RCCL (crucial for inter-node bw) -export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/lustre/orion/scratch/ssingh37/csc547/aws-ofi-rccl/build/lib" -#export NCCL_DEBUG=INFO -export FI_CXI_ATS=0 - -## this improves cross node bandwidth for some cases -export NCCL_CROSS_NIC=1 - -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -NNODES=$SLURM_JOB_NUM_NODES -GPUS_PER_NODE=8 ## change as per your machine -GPUS=$(( NNODES * GPUS_PER_NODE )) - -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 - -# data/checkpoint args -DATA_DIR="/lustre/orion/csc547/proj-shared/parallel_deep_learning/book_corpus" - - -CHECKPOINT_PATH="${DATA_DIR}/checkpoints" -VOCAB_FILE="${DATA_DIR}/gpt2-vocab.json" -MERGE_FILE="${DATA_DIR}/gpt2-merges.txt" -DATA_PATH="${DATA_DIR}/BookCorpusDataset_text_document" - -## ARCHITECTURE DETAILS -# 20B -NUM_LAYERS=32 -HIDDEN_SIZE=7168 -NUM_HEADS=56 - -# 40B -NUM_LAYERS=38 -HIDDEN_SIZE=9216 -NUM_HEADS=72 - -## PARALLELISM DETAILS -COLUMN_TENSOR_PARR=1 -ROW_TENSOR_PARR=2 -DEPTH_TENSOR_PARR=256 -PIPE_PARR=1 -CACHE_LAYERS=25 -OVERLAP=True - -## BATCH SIZES -MICRO_BATCH_SIZE=2048 -GLOBAL_BATCH_SIZE=2048 -SEQUENCE_LENGTH=2048 -TRAIN_ITERS=10 - -GPT_ARGS=" - --row-tensor-model-parallel-size ${ROW_TENSOR_PARR} \ - --column-tensor-model-parallel-size ${COLUMN_TENSOR_PARR} \ - --depth-tensor-model-parallel-size ${DEPTH_TENSOR_PARR} \ - --pipeline-model-parallel-size ${PIPE_PARR} \ - --num-layers ${NUM_LAYERS} \ - --hidden-size ${HIDDEN_SIZE} \ - --num-attention-heads ${NUM_HEADS} \ - --seq-length ${SEQUENCE_LENGTH} \ - --max-position-embeddings ${SEQUENCE_LENGTH} \ - --micro-batch-size 
${MICRO_BATCH_SIZE} \ - --global-batch-size ${GLOBAL_BATCH_SIZE} \ - --lr 0.00015 \ - --train-iters ${TRAIN_ITERS} \ - --lr-decay-iters 320000 \ - --lr-decay-style cosine \ - --min-lr 1.0e-5 \ - --weight-decay 1e-2 \ - --lr-warmup-fraction .01 \ - --clip-grad 1.0 \ - --bf16 \ - --no-gradient-accumulation-fusion \ - --use-amd \ - --recompute-granularity full \ - --recompute-method uniform \ - --recompute-num-layers 1 \ - --use-flash-attn \ -" -# --no-gradient-accumulation-fusion is neede on AMD -# --use-amd disables features incompatible with AMD - - -if [[ $OVERLAP == "True" ]] -then - GPT_ARGS="${GPT_ARGS} \ - --overlap-axonn-comm \ - --overlap-axonn-reduce-scatter \ - --overlap-axonn-all-gather\ - --num-layers-for-caching-weights-in-depth-tensor-parallel-all-gather ${CACHE_LAYERS}" -fi - -DATA_ARGS=" - --data-path $DATA_PATH \ - --vocab-file $VOCAB_FILE \ - --merge-file $MERGE_FILE \ - --split 949,50,1 -" - -OUTPUT_ARGS=" - --log-interval 1 \ - --save-interval 10000 \ - --eval-interval 1000 \ - --eval-iters 1 -" - -SCRIPT="python -u pretrain_gpt.py \ - $GPT_ARGS \ - $DATA_ARGS \ - $OUTPUT_ARGS \ - --distributed-backend nccl \ -" - - #--save $CHECKPOINT_PATH \ - #--load $CHECKPOINT_PATH - - -export OMP_NUM_THREADS=7 -run_cmd="srun -N ${NNODES} -n ${GPUS} -c7 --gpus-per-task=1 --gpu-bind=closest ${SCRIPT}" - -echo ${run_cmd} -eval ${run_cmd} -set +x diff --git a/examples/run_axonn_amd_tinyllama.sh b/examples/run_axonn_amd_tinyllama.sh new file mode 100755 index 0000000000..e7a491b23d --- /dev/null +++ b/examples/run_axonn_amd_tinyllama.sh @@ -0,0 +1,249 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -C nvme + +# tiny_llama = [ +# dict( +# name="tiny-llama-1.1b{}", +# hf_config=dict(org="TinyLlama", name="TinyLlama-1.1B{}"), +# block_size=2048, #### CHECKED (Seq Length) +# vocab_size=32000, #### NEED TO ADD VOCAB FILES +# padding_multiple=64, ### NOT RELEVANT +# n_layer=22, ### CHECKED (NUM_LAYERS) +# n_head=32, ### CHECKED (NUM_HEADS) +# n_embd=2048, 
### CHECKED (HIDDEN_SIZE) +# rotary_percentage=1.0, ### TODO: CHECK IF -- USING ROTARY (I think the default is 1: https://github.com/search?q=repo%3Aaxonn-ai%2FMegatron-AxoNN+rotary_percent&type=code) +# parallel_residual=False, ### TODO: CHECK (https://github.com/azshue/lit-gpt-dev/blob/99fb9363646bfacb686f72f58274392e6036ad6c/lit_gpt/model.py#L157 and apply_residual_connection_post_layernorm are the same) +# bias=False, ### TODO: CHECK "disable-bias-linear" I think. This is the bias for the linear layers +# _norm_class="RMSNorm", ### CHECKED "--normalization RMSNorm" +# norm_eps=1e-5, ### TODO: UNLCLEAR WHERE THIS IS -- I think this is fine (https://github.com/search?q=repo%3Aaxonn-ai%2FMegatron-AxoNN%20norm_eps&type=code) +# _mlp_class="LLaMAMLP", ### CHECKED "From Line 112, # --swiglu makes ParallelMLP equivalent to LLAMAMLP" +# intermediate_size=5632, ### CHECKED "FFN_HIDDEN_SIZE" +# n_query_groups=4, #### CHECKED: NUM_QUERY_GROUPS +# ) +# ] +### WE want global batch size of 4M so 4000000/2048 +#### We are gonna copy Olma's BS of 4M +# global_batch_size = 2048 #NEEL: UPDATED IN BASH SCRIPT +# learning_rate = 4e-4 #NEEL: Checked "--lr 4.0e-4" +#### THIS COULD BE SET ACCORDING TO HOW MANY GPUs we want to use +# micro_batch_size = 8 +# max_tokens = int(1e12) #NEEL: UPDATED IN BASH SCRIPT +# warmup_steps = 2000 # We are gonna use tinyllama warmup steps +#### BELOW ARE IRRELVANT #### +# log_step_interval = 1 +# eval_iters = 100 +# save_step_interval = 1000 +# eval_step_interval = 1000 +#### ABOVE ARE IRRELVANT #### + +# weight_decay = 1e-1 ### Neel: CHECKED "weight-decay 1e-1" +# beta1 = 0.9 ### Neel: CHECKED +# beta2 = 0.95 ### Neel: CHECKED +# grad_clip = 1.0 ### Neel: CHECKED +# decay_lr = True <--- This is irrevalant +# min_lr = 4e-5 ### Neel: CHECKED + +## calculating the number of nodes and GPUs +NNODES=$SLURM_JOB_NUM_NODES +GPUS_PER_NODE=8 ## change as per your machine +GPUS=$(( NNODES * GPUS_PER_NODE )) + +userid=$(whoami) +# These are the two things you 
need to change as per your setup
+# 1. Make LD_LIBRARY_PATH point to wherever your plugin is installed
+# this enables the slingshot-11 plugin for RCCL (crucial for inter-node bw)
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/ccs/home/$userid/aws-ofi-rccl/build/lib"
+# 2. Make PYTHONPATH point to your local clone of litgpt
+export PYTHONPATH="$PYTHONPATH:/lustre/orion/scratch/$userid/csc547/lit-gpt-dev"
+
+
+# This blob is setting up my python venv, ignore for conda builds
+echo "moving environment to burst buffer"
+## load venv onto burst buffer
+srun -N $NNODES --ntasks-per-node=1 prepare_venv.sh
+## delete old symbolic link
+rm -rf ~/axonn_venv
+## create new symbolic link to this user's burst-buffer copy (prepare_venv.sh extracts into /mnt/bb/<userid>/)
+ln -s /mnt/bb/${userid}/axonn_venv ~/axonn_venv
+module load PrgEnv-cray
+module load cray-python/3.9.13.1
+. /ccs/home/$userid/axonn_venv/bin/activate
+
+
+module load amd-mixed/5.6.0 #this should match with the rocm version your pytorch uses
+module load libfabric
+
+export MPICH_GPU_SUPPORT_ENABLED=0
+
+## some RCCL env variables
+export FI_CXI_ATS=0
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export NCCL_CROSS_NIC=1
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+
+# setting variables for torch.distributed
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export WORLD_SIZE=${GPUS}
+
+# train_data_dir and val_data_dir are set to this as of now
+DATADIR="/lustre/orion/csc569/proj-shared/language_datasets/"
+DATASET="spj_star_combined_full_tinyllama_tokd"
+DATAPATH="$DATADIR/$DATASET"
+
+TOKENIZER_DIR="/lustre/orion/csc569/proj-shared/megatron-axonn-tiny-llama-1.1b/llama-tokenizer"
+TOKENIZER_MODEL="${TOKENIZER_DIR}/tokenizer.model"
+
+# we will save and load model checkpoints here
+# if these are non-empty training will restart from the latest checkpoint here
+# else training will start from scratch
+CHECKPOINT_PATH="/lustre/orion/csc569/proj-shared/megatron-axonn-tiny-llama-1.1b/checkpoints/dataloader_correction"
+
+# tiny-llama1.1B architecture shapes
+# 
https://github.com/azshue/lit-gpt-dev/blob/tiny-llama/lit_gpt/config.py +NUM_LAYERS=22 +NUM_HEADS=32 +HIDDEN_SIZE=2048 +FFN_HIDDEN_SIZE=5632 +NUM_QUERY_GROUPS=4 + +# batch size, seq length, and iterations +GLOBAL_BATCH_SIZE=2048 ## Neel: 2048x2048 = 4M per batch +SEQUENCE_LENGTH=2048 +TOKENS_IN_BILLIONS=1000 ### Neel: Changed 1T ##### +TRAIN_ITERS=$(( TOKENS_IN_BILLIONS * 1000000000 / GLOBAL_BATCH_SIZE / SEQUENCE_LENGTH + 100 )) +echo "Number of training iterations : ${TRAIN_ITERS}" + +## AxoNN parallelism args +## These do not affect the science +ROW_TENSOR_PARR=1 +COLUMN_TENSOR_PARR=1 +DEPTH_TENSOR_PARR=1 +PIPE_PARR=1 +CACHE_LAYERS=0 +OVERLAP=True + + +GRAD_ACC=2 +GRADIENT_CHECKPOINT=False + +## DERIVED ARGUMENTS (ignore) +MP=$(( ROW_TENSOR_PARR * COLUMN_TENSOR_PARR * DEPTH_TENSOR_PARR )) +DP=$(( GPUS / MP )) +MICRO_BATCH_SIZE=$(( GLOBAL_BATCH_SIZE / DP / GRAD_ACC )) + +# The following args enable LLaMA +# --swiglu makes ParallelMLP equivalent to LLAMAMLP +# --group-query-attention - enables group query attention +# --num-query-groups - number of query groups for group query attention +# --normalization RMSNorm - switch from layernorm to RMSNorm (someone confirm?) 
+# --use-rotary-position-embeddings - use RoPE embeddings instead of learned position embeddings +# --untie-embeddings-and-output-weights - untie embedding and last layer weights +# --disable-bias-linear - disables bias in all nn.linear layers + +# The following args disable features not compatible with AMD +# --no-gradient-accumulation-fusion +# --use-amd + +GPT_ARGS=" + --row-tensor-model-parallel-size ${ROW_TENSOR_PARR} \ + --column-tensor-model-parallel-size ${COLUMN_TENSOR_PARR} \ + --depth-tensor-model-parallel-size ${DEPTH_TENSOR_PARR} \ + --pipeline-model-parallel-size ${PIPE_PARR} \ + --num-layers ${NUM_LAYERS} \ + --hidden-size ${HIDDEN_SIZE} \ + --num-attention-heads ${NUM_HEADS} \ + --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ + --seq-length ${SEQUENCE_LENGTH} \ + --max-position-embeddings ${SEQUENCE_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --lr 4.0e-4 \ + --train-iters ${TRAIN_ITERS} \ + --lr-decay-iters ${TRAIN_ITERS} \ + --lr-decay-style cosine \ + --min-lr 4.0e-5 \ + --weight-decay 1e-1 \ + --lr-warmup-iters 2000 \ + --clip-grad 1.0 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --bf16 \ + --no-gradient-accumulation-fusion \ + --use-amd \ + --use-flash-attn \ + --swiglu \ + --use-rotary-position-embeddings \ + --normalization RMSNorm \ + --group-query-attention \ + --num-query-groups ${NUM_QUERY_GROUPS} \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --use-apex-adam \ + --seed 78965 \ + --attention-dropout 0 \ + --hidden-dropout 0 +" + +if [[ $GRADIENT_CHECKPOINT == "True" ]] +then + GPT_ARGS="${GPT_ARGS} --recompute-granularity full \ + --recompute-method uniform \ + --recompute-num-layers 1" +fi + +## AxoNN specific args for communication optimizations +# these do not affect the ML science +if [[ $OVERLAP == "True" ]] +then + GPT_ARGS="${GPT_ARGS} \ + --overlap-axonn-comm \ + --overlap-axonn-reduce-scatter \ + --overlap-axonn-all-gather\ + 
--num-layers-for-caching-weights-in-depth-tensor-parallel-all-gather ${CACHE_LAYERS}"
+fi
+
+# --lit-gpt-data-path - is pointing to your dataset
+# currently both train and val splits are taken from --lit-gpt-data-path
+# the --custom-dataloader argument bypasses megatron's dataloaders
+# --num-workers 0 - disables multiprocess data loading
+# which can hang jobs at scale
+
+DATA_ARGS="
+    --lit-gpt-data-path $DATAPATH \
+    --custom-dataloader \
+    --num-workers 0 \
+    --tokenizer-type Llama2Tokenizer \
+    --tokenizer-model ${TOKENIZER_MODEL}
+"
+
+# --eval-interval 1000 - do validation after every 1000 iterations
+# --eval-iters 100 - do validation for 100 iterations
+# --save-interval 1000 - save the model after every 1000 iterations
+# --log-interval 1 - print iteration losses after every 1 iteration
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 1000 \
+    --eval-interval 1000 \
+    --eval-iters 100 \
+"
+
+SCRIPT="python -u pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH
+"
+
+
+export OMP_NUM_THREADS=7
+run_cmd="srun -N ${NNODES} -n ${GPUS} -c7 --gpus-per-task=1 --gpu-bind=closest ./examples/get_rank_from_slurm.sh ${SCRIPT}"
+
+echo ${run_cmd}
+eval ${run_cmd}
+set +x
diff --git a/megatron/arguments.py b/megatron/arguments.py
index 7f975bad14..ef92ffd273 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -422,6 +422,12 @@ def core_transformer_config_from_args(args):
     if args.init_method_xavier_uniform:
         kw_args['init_method'] = torch.nn.init.xavier_uniform_
         kw_args['scaled_init_method'] = torch.nn.init.xavier_uniform_
+    elif args.init_method_tiny_llama:
+        from megatron.core.utils import init_method_normal
+        import math
+        kw_args['init_method'] = init_method_normal(math.sqrt(2.0 / 5 / args.hidden_size))
+        kw_args['output_layer_init_method'] = init_method_normal(1 / math.sqrt(args.hidden_size) / args.num_layers )
+
     if args.group_query_attention:
kw_args['num_query_groups'] = args.num_query_groups else: @@ -699,6 +705,8 @@ def _add_regularization_args(parser): 'numerical stability') group.add_argument('--sgd-momentum', type=float, default=0.9, help='Momentum factor for sgd') + group.add_argument('--use-apex-adam', action='store_true', default=False, + help="Use Apex's implementation of Adam") return parser @@ -861,6 +869,8 @@ def _add_initialization_args(parser): 'distribution used for weight initialization.') group.add_argument('--init-method-xavier-uniform', action='store_true', help='Enable Xavier uniform parameter initialization') + group.add_argument('--init-method-tiny-llama', action='store_true', + help='Enable Tiny LLaMA based initialization') return parser @@ -1081,6 +1091,8 @@ def _add_validation_args(parser): def _add_data_args(parser): group = parser.add_argument_group(title='data and dataloader') + group.add_argument('--custom-dataloader', help="using custom dataloader, bypass megatron's" + "dataset/dataloader creation", action='store_true') group.add_argument('--data-path', nargs='*', default=None, help='Path to the training dataset. 
Accepted format:' '1) a single data path, 2) multiple datasets in the' @@ -1162,6 +1174,9 @@ def _add_data_args(parser): group.add_argument('--eod-mask-loss', action='store_true', help='Mask loss for the end of document tokens.') + ## add separate argument for lit gpt data paths + group.add_argument('--lit-gpt-data-path', type=str, + help="data path for custom lit gpt dataloaders") return parser diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py index b09879bd83..052b014f21 100644 --- a/megatron/core/tensor_parallel/layers.py +++ b/megatron/core/tensor_parallel/layers.py @@ -582,6 +582,8 @@ def __init__( keep_master_weight_for_test=False, skip_bias_add=False, skip_weight_param_allocation: bool = False, + for_embedding_and_clf_layer: bool = False + ): super(ColumnParallelLinear, self).__init__() @@ -590,10 +592,11 @@ def __init__( self.output_size = output_size self.gather_output = gather_output # Divide the weight matrix along the last dimension. - world_size = get_tensor_model_parallel_world_size() + world_size = get_tensor_model_parallel_world_size(for_embedding_and_clf_layer=for_embedding_and_clf_layer) self.output_size_per_partition = divide(output_size, world_size) self.skip_bias_add = skip_bias_add self.config = config + self.for_embedding_and_clf_layer = for_embedding_and_clf_layer # Parameters. # Note: torch.nn.functional.linear performs XA^T + b and as a result @@ -601,6 +604,7 @@ def __init__( # Initialize weight. 
if not skip_weight_param_allocation: if config.use_cpu_initialization: + raise NotImplementedError self.weight = Parameter( torch.empty( self.output_size_per_partition, self.input_size, dtype=config.params_dtype @@ -628,7 +632,8 @@ def __init__( ) if config.perform_initialization: _initialize_affine_weight_gpu( - self.weight, init_method, partition_dim=0, stride=stride + self.weight, init_method, partition_dim=0, stride=stride, + for_embedding_and_clf_layer=self.for_embedding_and_clf_layer ) else: self.weight = None @@ -724,6 +729,7 @@ def forward(self, input_: torch.Tensor, weight: Optional[torch.Tensor] = None): input_parallel = copy_to_tensor_model_parallel_region(input_) # Matrix multiply. if not weight.requires_grad: + raise NotImplementedError self._forward_impl = linear_with_frozen_weight else: self._forward_impl = linear_with_grad_accumulation_and_async_allreduce @@ -734,6 +740,7 @@ def forward(self, input_: torch.Tensor, weight: Optional[torch.Tensor] = None): gradient_accumulation_fusion=self.gradient_accumulation_fusion, async_grad_allreduce=self.async_tensor_model_parallel_allreduce, sequence_parallel=self.sequence_parallel, + for_embedding_and_clf_layer=self.for_embedding_and_clf_layer ) if self.gather_output: # All-gather across the partitions. 
diff --git a/megatron/model/gpt_model.py b/megatron/model/gpt_model.py index bb1af9b9e7..20d7bfde4c 100644 --- a/megatron/model/gpt_model.py +++ b/megatron/model/gpt_model.py @@ -58,7 +58,6 @@ def __init__(self, self.post_process = post_process self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy self.untie_embeddings_and_output_weights = args.untie_embeddings_and_output_weights - assert not self.untie_embeddings_and_output_weights, "Megatron-AxoNN doesn't support untied embedding yet" self.language_model, self._language_model_key = get_language_model( config=config, diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 37e225a54f..72b0e1ec85 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -95,6 +95,7 @@ def __init__(self, config, layer_number): out_features=ffn_hidden_size, skip_bias_add=True, init_method=config.init_method, + bias = self.add_bias ) self.bias_gelu_fusion = False @@ -125,13 +126,19 @@ def squared_relu(x): skip_bias_add=True, init_method=config.output_layer_init_method, transpose=True, + bias = self.add_bias ) def forward(self, hidden_states): torch.cuda.nvtx.range_push(f"MLP Block") # [s, b, 4hp] - intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states, scatter_input=False, gather_output=False, - cache_weights_in_all_gather = self.cache_weights_in_all_gather) + output = self.dense_h_to_4h(hidden_states, scatter_input=False, gather_output=False, + cache_weights_in_all_gather = self.cache_weights_in_all_gather) + + if isinstance(output, tuple): + intermediate_parallel, bias_parallel = output + else: + intermediate_parallel, bias_parallel = output, None if self.bias_gelu_fusion: assert self.add_bias is True @@ -143,8 +150,14 @@ def forward(self, hidden_states): intermediate_parallel = self.activation_func(intermediate_parallel) # [s, b, h] - output, output_bias = self.dense_4h_to_h(intermediate_parallel, scatter_input=False, gather_output=False, + output = 
self.dense_4h_to_h(intermediate_parallel, scatter_input=False, gather_output=False, cache_weights_in_all_gather = self.cache_weights_in_all_gather) + + if isinstance(output, tuple): + output, output_bias = output + else: + output, output_bias = output, None + torch.cuda.nvtx.range_pop() return output, output_bias @@ -463,8 +476,8 @@ def __init__(self, config, layer_number, self.query_key_value = Linear( in_features=config.hidden_size, out_features=query_projection_size + 2 * kv_projection_size, - skip_bias_add=True, - init_method=config.init_method) + init_method=config.init_method, + bias=args.add_bias_linear) else: raise NotImplementedError assert attention_type == AttnType.cross_attn @@ -504,7 +517,9 @@ def __init__(self, config, layer_number, out_features=config.hidden_size, skip_bias_add=True, init_method=config.output_layer_init_method, - transpose=True) + transpose=True, + bias=args.add_bias_linear + ) def _checkpointed_attention_forward(self, query_layer, key_layer, @@ -573,7 +588,7 @@ def forward(self, hidden_states, attention_mask, if self.attention_type == AttnType.self_attn: # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)] - mixed_x_layer, _ = self.query_key_value(hidden_states, scatter_input=False, gather_output=False, + mixed_x_layer = self.query_key_value(hidden_states, scatter_input=False, gather_output=False, cache_weights_in_all_gather=self.cache_weights_in_all_gather) # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn] @@ -602,7 +617,7 @@ def forward(self, hidden_states, attention_mask, dim=3) # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] - - query_layer = query_layer.view(query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head) + query_layer = query_layer.reshape(query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head) else: # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] mixed_kv_layer, _ = self.key_value(encoder_output) @@ -720,7 +735,12 @@ def forward(self, hidden_states, 
attention_mask, # Output. [sq, b, h] # ================= - output, bias = self.dense(context_layer, scatter_input=False, gather_output=False, cache_weights_in_all_gather=self.cache_weights_in_all_gather) + output = self.dense(context_layer, scatter_input=False, gather_output=False, cache_weights_in_all_gather=self.cache_weights_in_all_gather) + if isinstance(output, tuple): + output, bias = output + else: + output, bias = output, None + torch.cuda.nvtx.range_pop() return output, bias @@ -1675,7 +1695,6 @@ def forward(self, hidden_states, attention_mask, rotary_pos_emb, is_first_microbatch) else: - raise NotImplementedError forward_kwargs = { 'encoder_output': encoder_output, 'enc_dec_attn_mask': enc_dec_attn_mask, diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py index 33744a2f3a..29fb8e0760 100644 --- a/megatron/optimizer/__init__.py +++ b/megatron/optimizer/__init__.py @@ -1,6 +1,7 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -from apex.optimizers import FusedAdam as Adam +from apex.optimizers import FusedAdam as ApexAdam +from torch.optim import AdamW as Adam from apex.optimizers import FusedSGD as SGD from megatron import get_args @@ -72,7 +73,14 @@ def get_megatron_optimizer(model, lr_mult) if args.optimizer == 'adam': - optimizer = Adam(param_groups, + if args.use_apex_adam: + optimizer = ApexAdam(param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_eps) + else: + optimizer = Adam(param_groups, lr=args.lr, weight_decay=args.weight_decay, betas=(args.adam_beta1, args.adam_beta2), diff --git a/megatron/training.py b/megatron/training.py index 2dddca9679..0064970a97 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -121,22 +121,27 @@ def pretrain(train_valid_test_dataset_provider, # Data stuff. 
timers('train/valid/test-data-iterators-setup', log_level=0).start( barrier=True) - if args.virtual_pipeline_model_parallel_size is not None: - all_data_iterators = [ - build_train_valid_test_data_iterators( - train_valid_test_dataset_provider) - for _ in range(len(model)) - ] - train_data_iterator = [data_iterators[0] - for data_iterators in all_data_iterators] - valid_data_iterator = [data_iterators[1] - for data_iterators in all_data_iterators] - test_data_iterator = [data_iterators[2] - for data_iterators in all_data_iterators] + if not args.custom_dataloader: + if args.virtual_pipeline_model_parallel_size is not None: + all_data_iterators = [ + build_train_valid_test_data_iterators( + train_valid_test_dataset_provider) + for _ in range(len(model)) + ] + train_data_iterator = [data_iterators[0] + for data_iterators in all_data_iterators] + valid_data_iterator = [data_iterators[1] + for data_iterators in all_data_iterators] + test_data_iterator = [data_iterators[2] + for data_iterators in all_data_iterators] + else: + train_data_iterator, valid_data_iterator, test_data_iterator \ + = build_train_valid_test_data_iterators( + train_valid_test_dataset_provider) else: - train_data_iterator, valid_data_iterator, test_data_iterator \ - = build_train_valid_test_data_iterators( - train_valid_test_dataset_provider) + assert args.virtual_pipeline_model_parallel_size is None + train_data_iterator, valid_data_iterator = train_valid_test_dataset_provider(0) + test_data_iterator = None timers('train/valid/test-data-iterators-setup').stop() print_datetime('after dataloaders are built') @@ -651,7 +656,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, log_string += ' number of nan iterations: {:3d} |'.format( total_loss_dict[nan_iters_key]) log_string += ' theoretical FLOP/s: {:.3f} TFLOP/s | '.format(get_flops(elapsed_time_per_iteration)) - log_string += ' model size: {:.3f} B params | '.format(get_params()) + #log_string += ' model size: {:.3f} B 
params | '.format(get_params()) curr, peak = get_mem() log_string += ' memory used by tensors {:.3f} GB ( peak {:.3f} GB)'.format(curr, peak) @@ -677,6 +682,8 @@ def get_flops(batch_time): vocab_size = args.padded_vocab_size num_gpus = torch.distributed.get_world_size() teraflop_in_batch = 96*batch_size*seq_length*num_layers*(hidden_size**2)*(1+seq_length/(6*hidden_size)+(vocab_size)/(16*num_layers*hidden_size))/(1e12) + if args.swiglu: + teraflop_in_batch += (2*batch_size*seq_length*4*(hidden_size**2))*4*num_layers / 1e12 return teraflop_in_batch/batch_time/num_gpus @@ -688,6 +695,8 @@ def get_params(): hidden_size = args.hidden_size vocab_size = args.padded_vocab_size params = 12 * num_layers * (hidden_size ** 2)* ( 1 + 13/(12*hidden_size) + (vocab_size + seq_length)/(12 * num_layers * hidden_size)) / 1e9 + if args.swiglu: + params += num_layers * 4 * hidden_size ** 2 / 1e9 return params def get_mem(): diff --git a/prepare_venv.sh b/prepare_venv.sh new file mode 100755 index 0000000000..80b2aaf17d --- /dev/null +++ b/prepare_venv.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +userid=$(whoami) + +if [ ! -d /mnt/bb/${userid}/axonn_venv ]; then + cp /lustre/orion/scratch/${userid}/csc547/axonn_venv.tar.gz /mnt/bb/${userid}/ + cd /mnt/bb/${userid}/ + tar -xf axonn_venv.tar.gz +fi diff --git a/pretrain_gpt.py b/pretrain_gpt.py index bdc50cc0ac..75757ae5e5 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -1,8 +1,8 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
"""Pretrain GPT""" - import os +import time import torch from functools import partial from megatron import get_args @@ -24,6 +24,8 @@ from axonn import axonn as ax from contextlib import nullcontext +from custom_litgpt_dataloader.data_util import create_dataloaders + def model_provider(pre_process=True, post_process=True): """Build the model.""" args = get_args() @@ -54,6 +56,9 @@ def get_batch(data_iterator): data = next(data_iterator) else: data = None + + if args.custom_dataloader: + data = {"text": data} data_b = tensor_parallel.broadcast_data(keys, data, datatype) @@ -67,8 +72,8 @@ def get_batch(data_iterator): attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( tokens, tokenizer.eod, - args.reset_position_ids, - args.reset_attention_mask, + args.reset_position_ids, # for this to work we need access to the tokenizer + args.reset_attention_mask, # for this to work we need access to the tokenizer args.eod_mask_loss) return tokens, labels, loss_mask, attention_mask, position_ids @@ -114,12 +119,6 @@ def forward_step(data_iterator, model): labels = drop(labels, skip_channels=True) loss_mask = drop(loss_mask, skip_channels=True) position_ids = drop(position_ids, skip_channels=True) - #print(tokens.shape) - #print(labels.shape) - #print(loss_mask.shape) - #print(attention_mask.shape) - #print(position_ids.shape) - #exit() if args.overlap_axonn_comm: ctx = partial(optimize_communication, @@ -137,32 +136,61 @@ def forward_step(data_iterator, model): return output_tensor, partial(loss_func, loss_mask) + + def train_valid_test_datasets_provider(train_val_test_num_samples): """Build train, valid, and test datasets.""" args = get_args() - print_rank_0('> building train, validation, and test datasets ' - 'for GPT ...') - train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=args.data_path, - splits_string=args.split, - train_valid_test_num_samples=train_val_test_num_samples, - seq_length=args.seq_length, - seed=args.seed, - 
skip_warmup=(not args.mmap_warmup), - train_data_prefix=args.train_data_path, - valid_data_prefix=args.valid_data_path, - test_data_prefix=args.test_data_path, - data_cache_path=args.data_cache_path) - print_rank_0("> finished creating GPT datasets ...") - - return train_ds, valid_ds, test_ds - - -def set_device_and_init_torch_dist(): - from mpi4py import MPI - import os - + if args.custom_dataloader: + train_iterator, valid_iterator = create_dataloaders( + batch_size= args.micro_batch_size, + block_size= args.seq_length, + train_data_dir = args.lit_gpt_data_path, + val_data_dir = args.lit_gpt_data_path, + seed = 12345 + ) + # these flags are set within megatron in + # the OG dataloader + args.do_train = True + args.do_valid = True + args.do_test = False + if args.consumed_train_samples > 0 and train_iterator is not None: + print_rank_0(f"Rewinding dataloader to {args.consumed_train_samples} samples") + train_iterator_consumed_samples = 0 + fake_iters = 0 + start = time.time() + while train_iterator_consumed_samples < args.consumed_train_samples: + next(train_iterator) + train_iterator_consumed_samples += args.global_batch_size + fake_iters += 1 + if fake_iters % args.eval_interval == 0: + for _ in range(args.eval_iters): + next(valid_iterator) + end = time.time() + print_rank_0(f"Time for rewinding the dataloader on rank 0 = {end-start:.2f} s") + + return train_iterator, valid_iterator + else: + print_rank_0('> building train, validation, and test datasets ' + 'for GPT ...') + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + train_data_prefix=args.train_data_path, + valid_data_prefix=args.valid_data_path, + test_data_prefix=args.test_data_path, + data_cache_path=args.data_cache_path) + print_rank_0("> finished creating GPT datasets ...") + + return 
train_ds, valid_ds, test_ds + + +def set_device_and_init_torch_dist_mpi(): world_rank = MPI.COMM_WORLD.Get_rank() world_size = MPI.COMM_WORLD.Get_size() @@ -187,9 +215,12 @@ def set_device_and_init_torch_dist(): os.environ["WORLD_SIZE"] = str(world_size) + if __name__ == "__main__": - set_device_and_init_torch_dist() + #set_device_and_init_torch_dist_mpi() #torch.cuda.set_per_process_memory_fraction(0.5) # 40GB + # env variables being set in slurm + torch.distributed.init_process_group() pretrain(train_valid_test_datasets_provider, model_provider, ModelType.encoder_or_decoder,