diff --git a/custom_litgpt_dataloader/__init__.py b/custom_litgpt_dataloader/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/custom_litgpt_dataloader/data_util.py b/custom_litgpt_dataloader/data_util.py
new file mode 100644
index 0000000000..36da5ae70c
--- /dev/null
+++ b/custom_litgpt_dataloader/data_util.py
@@ -0,0 +1,96 @@
+from lit_gpt.packed_dataset import CombinedDataset, PackedDataset
+from lit_gpt.utils import CycleIterator
+from torch.utils.data import DataLoader
+from pathlib import Path
+from axonn import axonn as ax
+import torch.distributed as dist
+from typing import Tuple, Union, Optional
+
+data_config = [
+    ("train_slimpajama", 69.3584),
+    ("train_starcoder", 30.6),
+    # 0.693584, 0.306416)
+    # ("c4", 15.0),
+    # ("cc", 67.0),
+    # ("github", 4.5),
+    # ("stackexchange", 2.0),
+    # ("wikipedia", 4.5),
+]
+
+
+def create_dataloader(
+    batch_size: int, block_size: int, data_dir: Path, shuffle: bool = True, seed: int = 12345
+) -> DataLoader:
+    datasets = []
+    for prefix, _ in data_config:
+        filenames = list(data_dir.glob(f"{prefix}*"))
+        if not filenames:
+            raise FileNotFoundError(
+                f"No files found at {str(data_dir)} with prefix {prefix}. Did you forget to run `prepare_redpajama.py`?"
+            )
+        dataset = PackedDataset(
+            filenames,
+            n_chunks=4,
+            block_size=block_size,
+            shuffle=shuffle,
+            seed=seed,
+            num_processes=ax.config.G_data,
+            process_rank=ax.config.data_parallel_rank,
+        )
+        datasets.append(dataset)
+
+    if not datasets:
+        raise RuntimeError(
+            f"No data found at {data_dir}. Make sure you ran prepare_redpajama.py to create the dataset."
+        )
+
+    weights = [weight for _, weight in data_config]
+    sum_weights = sum(weights)
+    weights = [el / sum_weights for el in weights]
+
+    #having different seeds here is important such that each batch has tokens
+    #from all data mixtures.
+    combined_dataset = CombinedDataset(datasets=datasets, seed=seed, weights=weights)
+
+    return DataLoader(combined_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)
+
+
+def create_dataloaders(
+    batch_size: int,
+    block_size: int,
+    train_data_dir: str, 
+    val_data_dir: str,
+    seed: int = 12345, #this seed is independent of megatron's seeds
+) -> Tuple[DataLoader, DataLoader]:
+    # Increase by one because we need the next word as well
+    train_data_dir = Path(train_data_dir)
+    val_data_dir = Path(val_data_dir)
+    effective_block_size = block_size + 1
+    train_dataloader = create_dataloader(
+        batch_size=batch_size,
+        block_size=effective_block_size,
+        data_dir=train_data_dir,
+        shuffle=True,
+        seed=seed,
+    )
+    val_dataloader = (
+        create_dataloader(
+            batch_size=batch_size,
+            block_size=effective_block_size,
+            data_dir=val_data_dir,
+            shuffle=False,
+            seed=seed,
+        )
+        if val_data_dir
+        else None
+    )
+    return CycleIterator(train_dataloader), CycleIterator(val_dataloader)
+
+if __name__ == "__main__":
+    ax.init(G_inter=1, G_data=1, G_intra_r=8)
+    train_loader, val_loader = create_dataloaders(
+        batch_size=32,
+        block_size=1024, #sequence length
+    )
+    data = next(train_loader)
+    print(dist.get_rank(), ":", data.view(-1)[:5])
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000000..073b601ea6
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,40 @@
+# How to setup on frontier
+
+## Installing all dependencies
+Note that this is a python virtual environment based setup
+You might need to change this a bit for conda 
+
+Also this assumes that you are starting from scratch and have no venv/conda
+environment enabled.
+
+We are going to install everything on scratch.
+
+```
+cd /lustre/orion/scratch/$(whoami)/csc569/
+bash install_everything_on_frontier.sh
+```
+
+This should work, let Siddharth know if it doesn't
+
+## Training TinyLLaMA
+First checkout the tiny-llama branch of Megatron-AxoNN.
+Then open `examples/run_axonn_amd_tinyllama.sh`, and change the following
+
+```
+# These are the two things you need to change as per your setup
+# 1. Make LD_LIBRARY_PATH point to wherever your plugin is installed
+# this enables the slingshot-11 plugin for RCCL (crucial for inter-node bw)
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/path/to/aws-ofi-rccl/build/lib"
+# 2. Make PYTHONPATH point to your local clone of litgpt
+export PYTHONPATH="$PYTHONPATH:/path/to/lit-gpt-dev"
+```
+
+Now you are ready to train.
+To launch on 16 nodes (128 GPUs) for 2 hours:
+```
+## checkout the tiny-llama branch
+sbatch -N 128 -o /path/to/output/file -t 02:00:00 examples/run_axonn_amd_tinyllama.sh
+``` 
+
+
+
diff --git a/examples/get_rank_from_slurm.sh b/examples/get_rank_from_slurm.sh
new file mode 100755
index 0000000000..881bdc4a0e
--- /dev/null
+++ b/examples/get_rank_from_slurm.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# select_gpu_device wrapper script
+export RANK=${SLURM_PROCID}
+exec $*
diff --git a/examples/install_everything_on_frontier.sh b/examples/install_everything_on_frontier.sh
new file mode 100644
index 0000000000..ac7359d65e
--- /dev/null
+++ b/examples/install_everything_on_frontier.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+# Setup Virtual Environment
+echo "Setting up Virtual Environment"
+module load cray-python
+python -m venv ./my-venv --system-site-packages
+cd my-venv
+. bin/activate
+
+# PyTorch
+echo "Installing PyTorch"
+pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.6
+
+module load amd-mixed/5.6.0
+module load PrgEnv-cray
+
+# mpi4py
+echo "Installing mpi4py"
+module load craype-accel-amd-gfx90a
+export MPICH_GPU_SUPPORT_ENABLED=1
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CRAY_MPICH_ROOTDIR}/gtl/lib"
+echo ${LD_LIBRARY_PATH}
+
+MPICC=CC python -m pip install --ignore-installed --no-cache-dir mpi4py
+
+# Flash Attention
+echo "Installing Flash Attention"
+git clone https://github.com/ROCmSoftwarePlatform/flash-attention
+cd flash-attention
+vi setup.py -c ':%s/c++20/c++17/g' -c ':wq'
+CC=cc CXX=CC PYTORCH_ROCM_ARCH='gfx90a' GPU_ARCHS='gfx90a' pip install -v .
+
+# Apex
+echo "Installing Apex"
+cd ..
+git clone https://github.com/ROCmSoftwarePlatform/apex
+cd apex
+git checkout release/1.1.0
+CC=cc CXX=CC PYTORCH_ROCM_ARCH='gfx90a' GPU_ARCHS='gfx90a' python setup.py install --cpp_ext --cuda_ext
+
+# RCCL Plugin
+echo "Installing RCCL Plugin"
+cd ..
+git clone https://github.com/ROCmSoftwarePlatform/aws-ofi-rccl
+cd aws-ofi-rccl
+module load libtool
+./autogen.sh
+CC=cc CXX=CC ./configure --with-libfabric=/opt/cray/libfabric/1.15.0.0 --with-hip=/opt/rocm-5.6.0/ --with-rccl="$(dirname "$(pwd)")"/lib/python3.9/site-packages/torch/lib/ --prefix="$(dirname "$(pwd)")"/aws-ofi-rccl/build/
+CC=cc CXX=CC make -j install
+
+cd ..
+
+# AxoNN
+echo "Installing AxoNN"
+git clone https://github.com/axonn-ai/axonn.git
+cd axonn
+pip install -e .
+
+cd ..
+
+# Megatron-AxoNN
+echo "Installing Megatron-AxoNN"
+git clone https://github.com/axonn-ai/Megatron-AxoNN.git
+
+pip install regex
+
+echo "Done!"
+
diff --git a/examples/run_axonn_amd.sh b/examples/run_axonn_amd.sh
deleted file mode 100755
index ec3bf26923..0000000000
--- a/examples/run_axonn_amd.sh
+++ /dev/null
@@ -1,136 +0,0 @@
-#!/bin/bash
-
-# Runs the "345M" parameter model
-
-module load cray-python
-. /lustre/orion/scratch/ssingh37/csc547/venv_axonn_pt_2.1/bin/activate
-module load amd-mixed/5.6.0 #this should match with the rocm version your pytorch uses
-
-## these lines enable CUDA aware MPI
-module load craype-accel-amd-gfx90a
-export MPICH_GPU_SUPPORT_ENABLED=0
-export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CRAY_MPICH_ROOTDIR}/gtl/lib"
-
-## this enables the slingshot-11 plugin for RCCL (crucial for inter-node bw)
-export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/lustre/orion/scratch/ssingh37/csc547/aws-ofi-rccl/build/lib"
-#export NCCL_DEBUG=INFO
-export FI_CXI_ATS=0
-
-## this improves cross node bandwidth for some cases
-export NCCL_CROSS_NIC=1
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS_PER_NODE=8 ## change as per your machine
-GPUS=$(( NNODES * GPUS_PER_NODE )) 
-
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-
-# data/checkpoint args
-DATA_DIR="/lustre/orion/csc547/proj-shared/parallel_deep_learning/book_corpus"
-
-
-CHECKPOINT_PATH="${DATA_DIR}/checkpoints"
-VOCAB_FILE="${DATA_DIR}/gpt2-vocab.json"
-MERGE_FILE="${DATA_DIR}/gpt2-merges.txt"
-DATA_PATH="${DATA_DIR}/BookCorpusDataset_text_document"
-
-## ARCHITECTURE DETAILS
-# 20B
-NUM_LAYERS=32
-HIDDEN_SIZE=7168
-NUM_HEADS=56
-
-# 40B
-NUM_LAYERS=38
-HIDDEN_SIZE=9216
-NUM_HEADS=72
-
-## PARALLELISM DETAILS
-COLUMN_TENSOR_PARR=1
-ROW_TENSOR_PARR=2
-DEPTH_TENSOR_PARR=256
-PIPE_PARR=1
-CACHE_LAYERS=25
-OVERLAP=True
-
-## BATCH SIZES
-MICRO_BATCH_SIZE=2048
-GLOBAL_BATCH_SIZE=2048
-SEQUENCE_LENGTH=2048
-TRAIN_ITERS=10
-
-GPT_ARGS="
-    --row-tensor-model-parallel-size ${ROW_TENSOR_PARR} \
-    --column-tensor-model-parallel-size ${COLUMN_TENSOR_PARR} \
-    --depth-tensor-model-parallel-size ${DEPTH_TENSOR_PARR} \
-    --pipeline-model-parallel-size ${PIPE_PARR} \
-    --num-layers ${NUM_LAYERS} \
-    --hidden-size ${HIDDEN_SIZE} \
-    --num-attention-heads ${NUM_HEADS} \
-    --seq-length ${SEQUENCE_LENGTH} \
-    --max-position-embeddings ${SEQUENCE_LENGTH} \
-    --micro-batch-size ${MICRO_BATCH_SIZE} \
-    --global-batch-size ${GLOBAL_BATCH_SIZE} \
-    --lr 0.00015 \
-    --train-iters ${TRAIN_ITERS} \
-    --lr-decay-iters 320000 \
-    --lr-decay-style cosine \
-    --min-lr 1.0e-5 \
-    --weight-decay 1e-2 \
-    --lr-warmup-fraction .01 \
-    --clip-grad 1.0 \
-    --bf16 \
-    --no-gradient-accumulation-fusion \
-    --use-amd \
-    --recompute-granularity full \
-    --recompute-method uniform \
-    --recompute-num-layers 1 \
-    --use-flash-attn \
-"
-# --no-gradient-accumulation-fusion is neede on AMD
-# --use-amd disables features incompatible with AMD
-
-
-if [[ $OVERLAP == "True" ]]
-then
-	GPT_ARGS="${GPT_ARGS} \
-		--overlap-axonn-comm \
-		--overlap-axonn-reduce-scatter \
-		--overlap-axonn-all-gather\
-		--num-layers-for-caching-weights-in-depth-tensor-parallel-all-gather ${CACHE_LAYERS}"
-fi
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --vocab-file $VOCAB_FILE \
-    --merge-file $MERGE_FILE \
-    --split 949,50,1
-"
-
-OUTPUT_ARGS="
-    --log-interval 1 \
-    --save-interval 10000 \
-    --eval-interval 1000 \
-    --eval-iters 1
-"
-
-SCRIPT="python -u pretrain_gpt.py \
-    $GPT_ARGS \
-    $DATA_ARGS \
-    $OUTPUT_ARGS \
-    --distributed-backend nccl \
-"
-
-    #--save $CHECKPOINT_PATH \
-    #--load $CHECKPOINT_PATH
-
-
-export OMP_NUM_THREADS=7 
-run_cmd="srun -N ${NNODES} -n ${GPUS} -c7 --gpus-per-task=1 --gpu-bind=closest ${SCRIPT}" 
-
-echo ${run_cmd}
-eval ${run_cmd}
-set +x
diff --git a/examples/run_axonn_amd_tinyllama.sh b/examples/run_axonn_amd_tinyllama.sh
new file mode 100755
index 0000000000..e7a491b23d
--- /dev/null
+++ b/examples/run_axonn_amd_tinyllama.sh
@@ -0,0 +1,249 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -C nvme
+
+# tiny_llama = [
+#     dict(
+#         name="tiny-llama-1.1b{}",
+#         hf_config=dict(org="TinyLlama", name="TinyLlama-1.1B{}"),
+#         block_size=2048, #### CHECKED (Seq Length)
+#         vocab_size=32000, #### NEED TO ADD VOCAB FILES
+#         padding_multiple=64, ### NOT RELEVANT
+#         n_layer=22, ### CHECKED (NUM_LAYERS)
+#         n_head=32, ### CHECKED (NUM_HEADS)	
+#         n_embd=2048, ### CHECKED (HIDDEN_SIZE)
+#         rotary_percentage=1.0, ### TODO: CHECK IF -- USING ROTARY (I think the default is 1: https://github.com/search?q=repo%3Aaxonn-ai%2FMegatron-AxoNN+rotary_percent&type=code)
+#         parallel_residual=False, ### TODO: CHECK (https://github.com/azshue/lit-gpt-dev/blob/99fb9363646bfacb686f72f58274392e6036ad6c/lit_gpt/model.py#L157 and apply_residual_connection_post_layernorm are the same)
+#         bias=False, ### TODO: CHECK "disable-bias-linear" I think. This is the bias for the linear layers
+#         _norm_class="RMSNorm",  ### CHECKED "--normalization RMSNorm"
+#         norm_eps=1e-5, ### TODO: UNLCLEAR WHERE THIS IS -- I think this is fine (https://github.com/search?q=repo%3Aaxonn-ai%2FMegatron-AxoNN%20norm_eps&type=code)
+#         _mlp_class="LLaMAMLP", ### CHECKED "From Line 112, # --swiglu makes ParallelMLP equivalent to LLAMAMLP"
+#         intermediate_size=5632, ### CHECKED "FFN_HIDDEN_SIZE"
+#         n_query_groups=4, #### CHECKED: NUM_QUERY_GROUPS
+#     )
+# ]
+### WE want global batch size of 4M so 4000000/2048
+#### We are gonna copy Olma's BS of 4M
+# global_batch_size = 2048 #NEEL: UPDATED IN BASH SCRIPT
+# learning_rate = 4e-4 #NEEL: Checked "--lr 4.0e-4"
+#### THIS COULD BE SET ACCORDING TO HOW MANY GPUs we want to use
+# micro_batch_size = 8
+# max_tokens = int(1e12)  #NEEL: UPDATED IN BASH SCRIPT
+# warmup_steps = 2000 # We are gonna use tinyllama warmup steps
+#### BELOW ARE IRRELVANT ####
+# log_step_interval = 1
+# eval_iters = 100
+# save_step_interval = 1000
+# eval_step_interval = 1000
+#### ABOVE ARE IRRELVANT ####
+
+# weight_decay = 1e-1  ### Neel: CHECKED "weight-decay 1e-1"
+# beta1 = 0.9 ### Neel: CHECKED
+# beta2 = 0.95 ### Neel: CHECKED
+# grad_clip = 1.0 ### Neel: CHECKED 
+# decay_lr = True <--- This is irrevalant
+# min_lr = 4e-5 ### Neel: CHECKED 
+
+## calculating the number of nodes and GPUs
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS_PER_NODE=8 ## change as per your machine
+GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+userid=$(whoami)
+# These are the two things you need to change as per your setup
+# 1. Make LD_LIBRARY_PATH point to wherever your plugin is installed
+# this enables the slingshot-11 plugin for RCCL (crucial for inter-node bw)
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/ccs/home/$userid/aws-ofi-rccl/build/lib"
+# 2. Make PYTHONPATH point to your local clone of litgpt
+export PYTHONPATH="$PYTHONPATH:/lustre/orion/scratch/$userid/csc547/lit-gpt-dev"
+
+
+# This blob is setting up my python venv, ignore for conda builds
+echo "moving environment to burst buffer"
+## load venv onto burst buffer
+srun -N $NNODES --ntasks-per-node=1 prepare_venv.sh
+## delete old symbolic link
+rm -rf ~/axonn_venv
+## create new symbolic link
+ln -s /mnt/bb/ssingh37/axonn_venv ~/axonn_venv
+module load PrgEnv-cray
+module load cray-python/3.9.13.1
+. /ccs/home/$userid/axonn_venv/bin/activate
+
+
+module load amd-mixed/5.6.0 #this should match with the rocm version your pytorch uses
+module load libfabric
+
+export MPICH_GPU_SUPPORT_ENABLED=0
+
+## some RCCL env variables
+export FI_CXI_ATS=0
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export NCCL_CROSS_NIC=1
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+
+# setting variables for torch.distributed
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export WORLD_SIZE=${GPUS}
+
+# train_data_dir and val_data_dir are set to this as of now
+DATADIR="/lustre/orion/csc569/proj-shared/language_datasets/"
+DATASET="spj_star_combined_full_tinyllama_tokd"
+DATAPATH="$DATADIR/$DATASET"
+
+TOKENIZER_DIR="/lustre/orion/csc569/proj-shared/megatron-axonn-tiny-llama-1.1b/llama-tokenizer"
+TOKENIZER_MODEL="${TOKENIZER_DIR}/tokenizer.model"
+
+# we will save and load model checkpoints here
+# if these are non-empty training will restart from the latest checkpoint here
+# else training will start from scratch
+CHECKPOINT_PATH="/lustre/orion/csc569/proj-shared/megatron-axonn-tiny-llama-1.1b/checkpoints/dataloader_correction"
+
+# tiny-llama1.1B architecture shapes
+# https://github.com/azshue/lit-gpt-dev/blob/tiny-llama/lit_gpt/config.py
+NUM_LAYERS=22
+NUM_HEADS=32
+HIDDEN_SIZE=2048	
+FFN_HIDDEN_SIZE=5632
+NUM_QUERY_GROUPS=4
+
+# batch size, seq length, and iterations
+GLOBAL_BATCH_SIZE=2048 ## Neel: 2048x2048 = 4M per batch
+SEQUENCE_LENGTH=2048
+TOKENS_IN_BILLIONS=1000 ### Neel: Changed 1T #####
+TRAIN_ITERS=$(( TOKENS_IN_BILLIONS * 1000000000 / GLOBAL_BATCH_SIZE / SEQUENCE_LENGTH  + 100 )) 
+echo "Number of training iterations : ${TRAIN_ITERS}"
+
+## AxoNN parallelism args
+## These do not affect the science
+ROW_TENSOR_PARR=1
+COLUMN_TENSOR_PARR=1
+DEPTH_TENSOR_PARR=1
+PIPE_PARR=1
+CACHE_LAYERS=0
+OVERLAP=True
+
+
+GRAD_ACC=2
+GRADIENT_CHECKPOINT=False
+
+## DERIVED ARGUMENTS (ignore)
+MP=$(( ROW_TENSOR_PARR * COLUMN_TENSOR_PARR * DEPTH_TENSOR_PARR ))
+DP=$(( GPUS / MP ))
+MICRO_BATCH_SIZE=$(( GLOBAL_BATCH_SIZE / DP / GRAD_ACC ))
+
+# The following args enable LLaMA
+# --swiglu makes ParallelMLP equivalent to LLAMAMLP
+# --group-query-attention - enables group query attention
+# --num-query-groups - number of query groups for group query attention
+# --normalization RMSNorm - switch from layernorm to RMSNorm (someone confirm?)
+# --use-rotary-position-embeddings - use RoPE embeddings instead of learned position embeddings
+# --untie-embeddings-and-output-weights - untie embedding and last layer weights
+# --disable-bias-linear - disables bias in all nn.linear layers
+
+# The following args disable features not compatible with AMD
+# --no-gradient-accumulation-fusion 
+# --use-amd 
+
+GPT_ARGS="
+    --row-tensor-model-parallel-size ${ROW_TENSOR_PARR} \
+    --column-tensor-model-parallel-size ${COLUMN_TENSOR_PARR} \
+    --depth-tensor-model-parallel-size ${DEPTH_TENSOR_PARR} \
+    --pipeline-model-parallel-size ${PIPE_PARR} \
+    --num-layers ${NUM_LAYERS} \
+    --hidden-size ${HIDDEN_SIZE} \
+    --num-attention-heads ${NUM_HEADS} \
+    --ffn-hidden-size ${FFN_HIDDEN_SIZE} \
+    --seq-length ${SEQUENCE_LENGTH} \
+    --max-position-embeddings ${SEQUENCE_LENGTH} \
+    --micro-batch-size ${MICRO_BATCH_SIZE} \
+    --global-batch-size ${GLOBAL_BATCH_SIZE} \
+    --lr 4.0e-4 \
+    --train-iters ${TRAIN_ITERS} \
+    --lr-decay-iters ${TRAIN_ITERS} \
+    --lr-decay-style cosine \
+    --min-lr 4.0e-5 \
+    --weight-decay 1e-1 \
+    --lr-warmup-iters 2000 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --bf16 \
+    --no-gradient-accumulation-fusion \
+    --use-amd \
+    --use-flash-attn \
+    --swiglu \
+    --use-rotary-position-embeddings \
+    --normalization RMSNorm \
+    --group-query-attention \
+    --num-query-groups ${NUM_QUERY_GROUPS} \
+    --untie-embeddings-and-output-weights \
+    --disable-bias-linear \
+    --use-apex-adam \
+    --seed 78965 \
+    --attention-dropout 0 \
+    --hidden-dropout 0
+"
+
+if [[ $GRADIENT_CHECKPOINT == "True" ]]
+then
+    GPT_ARGS="${GPT_ARGS} --recompute-granularity full \
+    			  --recompute-method uniform \
+    			  --recompute-num-layers 1"
+fi
+
+## AxoNN specific args for communication optimizations
+# these do not affect the ML science
+if [[ $OVERLAP == "True" ]]
+then
+	GPT_ARGS="${GPT_ARGS} \
+		--overlap-axonn-comm \
+		--overlap-axonn-reduce-scatter \
+		--overlap-axonn-all-gather\
+		--num-layers-for-caching-weights-in-depth-tensor-parallel-all-gather ${CACHE_LAYERS}"
+fi
+
+# --lit-gpt-data-path - is pointing to your dataset
+# currently both train and val splits are taken fron --data-path
+# the --custom-dataloader argument bypasses megatron's dataloaders
+# --num-workers 0 - disables multiprocesses dataloading 
+# which can hang jobs at scale
+
+DATA_ARGS="
+    --lit-gpt-data-path $DATAPATH \
+    --custom-dataloader \
+    --num-workers 0 \
+    --tokenizer-type Llama2Tokenizer \
+    --tokenizer-model ${TOKENIZER_MODEL}
+"
+
+# --eval-interval 1000 - do validation after every 1000 arguments
+# --eval-iters 100 - do validation for 100 iterations
+# --save-interval 1000 - save the model after every 1000 iterations
+# --log-interval 1 - print iteration lossees after every 1 iteration
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 1000 \
+    --eval-interval 1000 \
+    --eval-iters 100 \
+"
+
+SCRIPT="python -u pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH
+"
+
+
+export OMP_NUM_THREADS=7 
+run_cmd="srun -N ${NNODES} -n ${GPUS} -c7 --gpus-per-task=1 --gpu-bind=closest ./examples/get_rank_from_slurm.sh ${SCRIPT}" 
+
+echo ${run_cmd}
+eval ${run_cmd}
+set +x
diff --git a/megatron/arguments.py b/megatron/arguments.py
index 7f975bad14..ef92ffd273 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -422,6 +422,12 @@ def core_transformer_config_from_args(args):
     if args.init_method_xavier_uniform:
         kw_args['init_method'] = torch.nn.init.xavier_uniform_
         kw_args['scaled_init_method'] = torch.nn.init.xavier_uniform_
+    elif args.init_method_tiny_llama:
+        from megatron.core.utils import init_method_normal
+        import math
+        kw_args['init_method'] = init_method_normal(math.sqrt(2.0 / 5 / args.hidden_size))
+        kw_args['output_layer_init_method'] = init_method_normal(1 / math.sqrt(args.hidden_size) / args.num_layers )
+
     if args.group_query_attention:
         kw_args['num_query_groups'] = args.num_query_groups
     else:
@@ -699,6 +705,8 @@ def _add_regularization_args(parser):
                        'numerical stability')
     group.add_argument('--sgd-momentum', type=float, default=0.9,
                        help='Momentum factor for sgd')
+    group.add_argument('--use-apex-adam', action='store_true', default=False,
+                       help="Use Apex's implementation of Adam")
 
     return parser
 
@@ -861,6 +869,8 @@ def _add_initialization_args(parser):
                        'distribution used for weight initialization.')
     group.add_argument('--init-method-xavier-uniform', action='store_true',
                        help='Enable Xavier uniform parameter initialization')
+    group.add_argument('--init-method-tiny-llama', action='store_true',
+                       help='Enable Tiny LLaMA based initialization')
 
     return parser
 
@@ -1081,6 +1091,8 @@ def _add_validation_args(parser):
 def _add_data_args(parser):
     group = parser.add_argument_group(title='data and dataloader')
 
+    group.add_argument('--custom-dataloader', help="using custom dataloader, bypass megatron's"
+                       "dataset/dataloader creation", action='store_true')
     group.add_argument('--data-path', nargs='*', default=None,
                        help='Path to the training dataset. Accepted format:'
                        '1) a single data path, 2) multiple datasets in the'
@@ -1162,6 +1174,9 @@ def _add_data_args(parser):
     group.add_argument('--eod-mask-loss', action='store_true',
                        help='Mask loss for the end of document tokens.')
 
+    ## add separate argument for lit gpt data paths
+    group.add_argument('--lit-gpt-data-path', type=str,
+                       help="data path for custom lit gpt dataloaders")
     return parser
 
 
diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py
index b09879bd83..052b014f21 100644
--- a/megatron/core/tensor_parallel/layers.py
+++ b/megatron/core/tensor_parallel/layers.py
@@ -582,6 +582,8 @@ def __init__(
         keep_master_weight_for_test=False,
         skip_bias_add=False,
         skip_weight_param_allocation: bool = False,
+        for_embedding_and_clf_layer: bool = False
+
     ):
         super(ColumnParallelLinear, self).__init__()
 
@@ -590,10 +592,11 @@ def __init__(
         self.output_size = output_size
         self.gather_output = gather_output
         # Divide the weight matrix along the last dimension.
-        world_size = get_tensor_model_parallel_world_size()
+        world_size = get_tensor_model_parallel_world_size(for_embedding_and_clf_layer=for_embedding_and_clf_layer)
         self.output_size_per_partition = divide(output_size, world_size)
         self.skip_bias_add = skip_bias_add
         self.config = config
+        self.for_embedding_and_clf_layer = for_embedding_and_clf_layer
 
         # Parameters.
         # Note: torch.nn.functional.linear performs XA^T + b and as a result
@@ -601,6 +604,7 @@ def __init__(
         # Initialize weight.
         if not skip_weight_param_allocation:
             if config.use_cpu_initialization:
+                raise NotImplementedError
                 self.weight = Parameter(
                     torch.empty(
                         self.output_size_per_partition, self.input_size, dtype=config.params_dtype
@@ -628,7 +632,8 @@ def __init__(
                 )
                 if config.perform_initialization:
                     _initialize_affine_weight_gpu(
-                        self.weight, init_method, partition_dim=0, stride=stride
+                        self.weight, init_method, partition_dim=0, stride=stride, 
+                        for_embedding_and_clf_layer=self.for_embedding_and_clf_layer
                     )
         else:
             self.weight = None
@@ -724,6 +729,7 @@ def forward(self, input_: torch.Tensor, weight: Optional[torch.Tensor] = None):
             input_parallel = copy_to_tensor_model_parallel_region(input_)
         # Matrix multiply.
         if not weight.requires_grad:
+            raise NotImplementedError
             self._forward_impl = linear_with_frozen_weight
         else:
             self._forward_impl = linear_with_grad_accumulation_and_async_allreduce
@@ -734,6 +740,7 @@ def forward(self, input_: torch.Tensor, weight: Optional[torch.Tensor] = None):
             gradient_accumulation_fusion=self.gradient_accumulation_fusion,
             async_grad_allreduce=self.async_tensor_model_parallel_allreduce,
             sequence_parallel=self.sequence_parallel,
+            for_embedding_and_clf_layer=self.for_embedding_and_clf_layer
         )
         if self.gather_output:
             # All-gather across the partitions.
diff --git a/megatron/model/gpt_model.py b/megatron/model/gpt_model.py
index bb1af9b9e7..20d7bfde4c 100644
--- a/megatron/model/gpt_model.py
+++ b/megatron/model/gpt_model.py
@@ -58,7 +58,6 @@ def __init__(self,
         self.post_process = post_process
         self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
         self.untie_embeddings_and_output_weights = args.untie_embeddings_and_output_weights
-        assert not self.untie_embeddings_and_output_weights, "Megatron-AxoNN doesn't support untied embedding yet"
 
         self.language_model, self._language_model_key = get_language_model(
             config=config,
diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py
index 37e225a54f..72b0e1ec85 100644
--- a/megatron/model/transformer.py
+++ b/megatron/model/transformer.py
@@ -95,6 +95,7 @@ def __init__(self, config, layer_number):
                     out_features=ffn_hidden_size,
                     skip_bias_add=True,
                     init_method=config.init_method,
+                    bias = self.add_bias
                 )
 
         self.bias_gelu_fusion = False
@@ -125,13 +126,19 @@ def squared_relu(x):
             skip_bias_add=True,
             init_method=config.output_layer_init_method,
             transpose=True,
+            bias = self.add_bias
         )
 
     def forward(self, hidden_states):
         torch.cuda.nvtx.range_push(f"MLP Block")
         # [s, b, 4hp]
-        intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states, scatter_input=False, gather_output=False, 
-                                                                  cache_weights_in_all_gather = self.cache_weights_in_all_gather)
+        output = self.dense_h_to_4h(hidden_states, scatter_input=False, gather_output=False, 
+                                    cache_weights_in_all_gather = self.cache_weights_in_all_gather)
+
+        if isinstance(output, tuple):
+            intermediate_parallel, bias_parallel = output
+        else:
+            intermediate_parallel, bias_parallel = output, None
 
         if self.bias_gelu_fusion:
             assert self.add_bias is True
@@ -143,8 +150,14 @@ def forward(self, hidden_states):
             intermediate_parallel = self.activation_func(intermediate_parallel)
 
         # [s, b, h]
-        output, output_bias = self.dense_4h_to_h(intermediate_parallel, scatter_input=False, gather_output=False, 
+        output = self.dense_4h_to_h(intermediate_parallel, scatter_input=False, gather_output=False, 
                                                                   cache_weights_in_all_gather = self.cache_weights_in_all_gather)
+        
+        if isinstance(output, tuple):
+            output, output_bias = output
+        else:
+            output, output_bias = output, None
+
         torch.cuda.nvtx.range_pop()
         return output, output_bias
 
@@ -463,8 +476,8 @@ def __init__(self, config, layer_number,
             self.query_key_value = Linear(
                     in_features=config.hidden_size,
                     out_features=query_projection_size + 2 * kv_projection_size,
-                    skip_bias_add=True,
-                    init_method=config.init_method)
+                    init_method=config.init_method,
+                    bias=args.add_bias_linear)
         else:
             raise NotImplementedError
             assert attention_type == AttnType.cross_attn
@@ -504,7 +517,9 @@ def __init__(self, config, layer_number,
                 out_features=config.hidden_size,
                 skip_bias_add=True,
                 init_method=config.output_layer_init_method,
-                transpose=True)
+                transpose=True,
+                bias=args.add_bias_linear
+                )
 
 
     def _checkpointed_attention_forward(self, query_layer, key_layer,
@@ -573,7 +588,7 @@ def forward(self, hidden_states, attention_mask,
         if self.attention_type == AttnType.self_attn:
 
             # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)]
-            mixed_x_layer, _ = self.query_key_value(hidden_states, scatter_input=False, gather_output=False, 
+            mixed_x_layer = self.query_key_value(hidden_states, scatter_input=False, gather_output=False, 
                                                     cache_weights_in_all_gather=self.cache_weights_in_all_gather)
 
             # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn]
@@ -602,7 +617,7 @@ def forward(self, hidden_states, attention_mask,
                 dim=3)
 
             # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] -
-            query_layer = query_layer.view(query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head)
+            query_layer = query_layer.reshape(query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head)
         else:
             # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)]
             mixed_kv_layer, _ = self.key_value(encoder_output)
@@ -720,7 +735,12 @@ def forward(self, hidden_states, attention_mask,
         # Output. [sq, b, h]
         # =================
 
-        output, bias = self.dense(context_layer, scatter_input=False, gather_output=False, cache_weights_in_all_gather=self.cache_weights_in_all_gather)
+        output = self.dense(context_layer, scatter_input=False, gather_output=False, cache_weights_in_all_gather=self.cache_weights_in_all_gather)
+        if isinstance(output, tuple):
+            output, bias = output
+        else:
+            output, bias = output, None
+
         torch.cuda.nvtx.range_pop()
         return output, bias
 
@@ -1675,7 +1695,6 @@ def forward(self, hidden_states, attention_mask,
                                                                rotary_pos_emb,
                                                                is_first_microbatch)
                 else:
-                    raise NotImplementedError
                     forward_kwargs = {
                         'encoder_output': encoder_output,
                         'enc_dec_attn_mask': enc_dec_attn_mask,
diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py
index 33744a2f3a..29fb8e0760 100644
--- a/megatron/optimizer/__init__.py
+++ b/megatron/optimizer/__init__.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 
-from apex.optimizers import FusedAdam as Adam
+from apex.optimizers import FusedAdam as ApexAdam
+from torch.optim import AdamW as Adam
 from apex.optimizers import FusedSGD as SGD
 
 from megatron import get_args
@@ -72,7 +73,14 @@ def get_megatron_optimizer(model,
                                     lr_mult)
 
     if args.optimizer == 'adam':
-        optimizer = Adam(param_groups,
+        if args.use_apex_adam:
+            optimizer = ApexAdam(param_groups,
+                         lr=args.lr,
+                         weight_decay=args.weight_decay,
+                         betas=(args.adam_beta1, args.adam_beta2),
+                         eps=args.adam_eps)
+        else:
+            optimizer = Adam(param_groups,
                          lr=args.lr,
                          weight_decay=args.weight_decay,
                          betas=(args.adam_beta1, args.adam_beta2),
diff --git a/megatron/training.py b/megatron/training.py
index 2dddca9679..0064970a97 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -121,22 +121,27 @@ def pretrain(train_valid_test_dataset_provider,
     # Data stuff.
     timers('train/valid/test-data-iterators-setup', log_level=0).start(
         barrier=True)
-    if args.virtual_pipeline_model_parallel_size is not None:
-        all_data_iterators = [
-            build_train_valid_test_data_iterators(
-                train_valid_test_dataset_provider)
-            for _ in range(len(model))
-        ]
-        train_data_iterator = [data_iterators[0]
-                               for data_iterators in all_data_iterators]
-        valid_data_iterator = [data_iterators[1]
-                               for data_iterators in all_data_iterators]
-        test_data_iterator = [data_iterators[2]
-                              for data_iterators in all_data_iterators]
+    if not args.custom_dataloader:
+        if args.virtual_pipeline_model_parallel_size is not None:
+            all_data_iterators = [
+                build_train_valid_test_data_iterators(
+                    train_valid_test_dataset_provider)
+                for _ in range(len(model))
+            ]
+            train_data_iterator = [data_iterators[0]
+                                   for data_iterators in all_data_iterators]
+            valid_data_iterator = [data_iterators[1]
+                                   for data_iterators in all_data_iterators]
+            test_data_iterator = [data_iterators[2]
+                                  for data_iterators in all_data_iterators]
+        else:
+            train_data_iterator, valid_data_iterator, test_data_iterator \
+                = build_train_valid_test_data_iterators(
+                    train_valid_test_dataset_provider)
     else:
-        train_data_iterator, valid_data_iterator, test_data_iterator \
-            = build_train_valid_test_data_iterators(
-                train_valid_test_dataset_provider)
+        assert args.virtual_pipeline_model_parallel_size is None
+        train_data_iterator, valid_data_iterator = train_valid_test_dataset_provider(0)
+        test_data_iterator = None
     timers('train/valid/test-data-iterators-setup').stop()
     print_datetime('after dataloaders are built')
 
@@ -651,7 +656,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
         log_string += ' number of nan iterations: {:3d} |'.format(
             total_loss_dict[nan_iters_key])
         log_string += ' theoretical FLOP/s: {:.3f} TFLOP/s | '.format(get_flops(elapsed_time_per_iteration))
-        log_string += ' model size: {:.3f} B params | '.format(get_params())
+        #log_string += ' model size: {:.3f} B params | '.format(get_params())
         curr, peak = get_mem()
         log_string += ' memory used by tensors {:.3f} GB ( peak {:.3f} GB)'.format(curr, peak)
 
@@ -677,6 +682,8 @@ def get_flops(batch_time):
     vocab_size = args.padded_vocab_size
     num_gpus = torch.distributed.get_world_size()
     teraflop_in_batch = 96*batch_size*seq_length*num_layers*(hidden_size**2)*(1+seq_length/(6*hidden_size)+(vocab_size)/(16*num_layers*hidden_size))/(1e12)
+    if args.swiglu:
+        teraflop_in_batch += (2*batch_size*seq_length*4*(hidden_size**2))*4*num_layers / 1e12
     return teraflop_in_batch/batch_time/num_gpus
 
 
@@ -688,6 +695,8 @@ def get_params():
     hidden_size = args.hidden_size
     vocab_size = args.padded_vocab_size
     params = 12 * num_layers * (hidden_size ** 2)* ( 1 + 13/(12*hidden_size) + (vocab_size + seq_length)/(12 * num_layers * hidden_size)) / 1e9
+    if args.swiglu:
+        params += num_layers * 4 * hidden_size ** 2 / 1e9
     return params
    
 def get_mem():
diff --git a/prepare_venv.sh b/prepare_venv.sh
new file mode 100755
index 0000000000..80b2aaf17d
--- /dev/null
+++ b/prepare_venv.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+userid=$(whoami)
+
+if [ ! -d /mnt/bb/${userid}/axonn_venv ]; then
+	cp /lustre/orion/scratch/${userid}/csc547/axonn_venv.tar.gz /mnt/bb/${userid}/
+	cd /mnt/bb/${userid}/
+	tar -xf axonn_venv.tar.gz
+fi
diff --git a/pretrain_gpt.py b/pretrain_gpt.py
index bdc50cc0ac..75757ae5e5 100644
--- a/pretrain_gpt.py
+++ b/pretrain_gpt.py
@@ -1,8 +1,8 @@
 # Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
 
 """Pretrain GPT"""
-
 import os
+import time
 import torch
 from functools import partial
 from megatron import get_args
@@ -24,6 +24,8 @@
 from axonn import axonn as ax
 from contextlib import nullcontext
 
+from custom_litgpt_dataloader.data_util import create_dataloaders
+
 def model_provider(pre_process=True, post_process=True):
     """Build the model."""
     args = get_args()
@@ -54,6 +56,9 @@ def get_batch(data_iterator):
         data = next(data_iterator)
     else:
         data = None
+    
+    if args.custom_dataloader:
+        data = {"text": data}
 
     data_b = tensor_parallel.broadcast_data(keys, data, datatype)
     
@@ -67,8 +72,8 @@ def get_batch(data_iterator):
     attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
         tokens,
         tokenizer.eod,
-        args.reset_position_ids,
-        args.reset_attention_mask,
+        args.reset_position_ids, # for this to work we need access to the tokenizer
+        args.reset_attention_mask, # for this to work we need access to the tokenizer
         args.eod_mask_loss)
 
     return tokens, labels, loss_mask, attention_mask, position_ids
@@ -114,12 +119,6 @@ def forward_step(data_iterator, model):
     labels = drop(labels, skip_channels=True)
     loss_mask = drop(loss_mask, skip_channels=True)
     position_ids = drop(position_ids, skip_channels=True)
-        #print(tokens.shape)
-        #print(labels.shape)
-        #print(loss_mask.shape)
-        #print(attention_mask.shape)
-        #print(position_ids.shape)
-        #exit()
     
     if args.overlap_axonn_comm:
         ctx = partial(optimize_communication, 
@@ -137,32 +136,61 @@ def forward_step(data_iterator, model):
     return output_tensor, partial(loss_func, loss_mask)
 
 
+
+
 def train_valid_test_datasets_provider(train_val_test_num_samples):
     """Build train, valid, and test datasets."""
     args = get_args()
 
-    print_rank_0('> building train, validation, and test datasets '
-                 'for GPT ...')
-    train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
-        data_prefix=args.data_path,
-        splits_string=args.split,
-        train_valid_test_num_samples=train_val_test_num_samples,
-        seq_length=args.seq_length,
-        seed=args.seed,
-        skip_warmup=(not args.mmap_warmup),
-        train_data_prefix=args.train_data_path,
-        valid_data_prefix=args.valid_data_path,
-        test_data_prefix=args.test_data_path,
-        data_cache_path=args.data_cache_path)
-    print_rank_0("> finished creating GPT datasets ...")
-
-    return train_ds, valid_ds, test_ds
-
-
-def set_device_and_init_torch_dist():
-    from mpi4py import MPI
-    import os
-
+    if args.custom_dataloader:
+        train_iterator, valid_iterator = create_dataloaders(
+            batch_size= args.micro_batch_size,
+            block_size= args.seq_length,
+            train_data_dir = args.lit_gpt_data_path,
+            val_data_dir = args.lit_gpt_data_path,
+            seed = 12345
+        )
+        # these flags are set within megatron in 
+        # the OG dataloader
+        args.do_train = True
+        args.do_valid = True
+        args.do_test = False
+        if args.consumed_train_samples > 0 and train_iterator is not None:
+            print_rank_0(f"Rewinding dataloader to {args.consumed_train_samples} samples")
+            train_iterator_consumed_samples = 0
+            fake_iters = 0
+            start = time.time()
+            while train_iterator_consumed_samples < args.consumed_train_samples:
+                next(train_iterator)
+                train_iterator_consumed_samples += args.global_batch_size 
+                fake_iters += 1
+                if fake_iters % args.eval_interval == 0:
+                    for _ in range(args.eval_iters):
+                        next(valid_iterator)
+            end = time.time()
+            print_rank_0(f"Time for rewinding the dataloader on rank 0 = {end-start:.2f} s")
+        
+        return train_iterator, valid_iterator
+    else:
+        print_rank_0('> building train, validation, and test datasets '
+                     'for GPT ...')
+        train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
+            data_prefix=args.data_path,
+            splits_string=args.split,
+            train_valid_test_num_samples=train_val_test_num_samples,
+            seq_length=args.seq_length,
+            seed=args.seed,
+            skip_warmup=(not args.mmap_warmup),
+            train_data_prefix=args.train_data_path,
+            valid_data_prefix=args.valid_data_path,
+            test_data_prefix=args.test_data_path,
+            data_cache_path=args.data_cache_path)
+        print_rank_0("> finished creating GPT datasets ...")
+
+        return train_ds, valid_ds, test_ds
+
+
+def set_device_and_init_torch_dist_mpi():
     world_rank = MPI.COMM_WORLD.Get_rank()
     world_size = MPI.COMM_WORLD.Get_size()
 
@@ -187,9 +215,12 @@ def set_device_and_init_torch_dist():
     os.environ["WORLD_SIZE"] = str(world_size)
 
 
+
 if __name__ == "__main__":
-    set_device_and_init_torch_dist()
+    #set_device_and_init_torch_dist_mpi()
     #torch.cuda.set_per_process_memory_fraction(0.5) # 40GB
+    # env variables being set in slurm
+    torch.distributed.init_process_group()
     pretrain(train_valid_test_datasets_provider,
              model_provider,
              ModelType.encoder_or_decoder,