Engine: Improve output structure, CLI: Configurable model options, Separate `finetune`+`generate-adapter` (#1361)

## Describe your changes
**Engine**:
- Improve the output folder structure of a workflow run. The current
structure was meant for multiple-EP, multiple-pass-flow workflows, but that
is not the common usage for Olive.
- Unnecessary nesting for the accelerator spec and pass flows is removed in
the single-EP, single-pass-flow scenario.
- `output_name` is removed from both pass config and engine config.
- The behavior of `output_name` was arbitrary. Users can get the output in a
specific folder by directly providing `output_dir` like
`parent-dir/specific-dir`.
- `output_name` was also allowed in the pass config to save intermediate
models, but the same can be achieved by providing multiple pass flows like
`[[A, B], [A, B, C]]`, which is cleaner (see the config sketch after this
list).
- Refer to `Engine.run` for more details on the new output structure.
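
As an illustration (the pass names `A`/`B`/`C` and the output path below are placeholders, not taken from this PR; the pass types are common Olive passes used only for the example), a run config can save both an intermediate and a final model under a chosen directory without any `output_name`:

```json
{
  "passes": {
    "A": { "type": "OnnxConversion" },
    "B": { "type": "OrtTransformersOptimization" },
    "C": { "type": "OnnxQuantization" }
  },
  "pass_flows": [ [ "A", "B" ], [ "A", "B", "C" ] ],
  "output_dir": "models/my-model"
}
```

Here the `[A, B]` flow yields the intermediate model and `[A, B, C]` the fully optimized one, both placed under `models/my-model` (other required fields such as `input_model` are omitted from this sketch).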

**CLI**:
- `add_model_options` is made configurable so that only the desired
model-type-related options are added.
- `save_output_model` uses the new engine output directory structure to
copy the output model into the final output directory.
- The `finetune` command is separated into `finetune` and `generate-adapter`
commands. These commands can be chained, as shown in the llama2 multilora
notebook and in the sketch below.
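
For instance, a chained invocation might look roughly like this (paths mirror the notebook; data-formatting options such as `--text_template` are omitted, so treat it as a sketch rather than a complete command line):

```bash
# Sketch based on the llama2 multilora notebook; dataset/template options trimmed.
olive finetune \
    -m meta-llama/Llama-2-7b-hf -d nampdn-ai/tiny-codes \
    --max_steps 15 --logging_steps 5 \
    -o models/tiny-codes/fine-tune

# Chain: the fine-tuned output becomes the input of generate-adapter.
olive generate-adapter \
    -m models/tiny-codes/fine-tune --use_ort_genai \
    -o models/tiny-codes/optimized
```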

## Checklist before requesting a review
- [x] Add unit tests for this change.
- [x] Make sure all tests can pass.
- [x] Update documents if necessary.
- [x] Lint and apply fixes to your code by running `lintrunner -a`
- [ ] Is this a user-facing change? If yes, give a description of this
change to be included in the release notes.
- [ ] Is this PR including examples changes? If yes, please remember to
update [example
documentation](https://github.com/microsoft/Olive/blob/main/docs/source/examples.md)
in a follow-up PR.

## (Optional) Issue link
jambayk committed Sep 18, 2024
1 parent 9fa2604 commit 9301aae
Showing 55 changed files with 693 additions and 587 deletions.
4 changes: 0 additions & 4 deletions docs/source/overview/options.md
@@ -357,10 +357,6 @@ will be used.

- `clean_run_cache: [Boolean]` This decides whether to clean the run cache of the pass before running the pass. This is `false` by default.

- `output_name: str` In no-search mode (i.e., `search_strategy` is `null`), if `output_name` is provided, the output model of the pass will be
saved to the engine's `output_dir` with the prefix of `output_name`. For the final pass, if the engine's `output_name` is provided, it will override
the `output_name` of the pass.

Please refer to [Configuring Pass](../tutorials/configure_pass.rst) for more details on `type`, `disable_search` and `config`.

Please also find the detailed options from following table for each pass:
1 change: 0 additions & 1 deletion examples/directml/llm/config_llm.json
@@ -97,6 +97,5 @@
"host": "local_system",
"target": "local_system",
"cache_dir": "cache",
"output_name": "",
"output_dir": "footprints"
}
3 changes: 1 addition & 2 deletions examples/directml/llm/llm.py
@@ -160,7 +160,6 @@ def optimize(
"cuda": ["CUDAExecutionProvider"],
}[device]

olive_config["output_name"] = model_name
olive_config["passes"]["optimize"]["hidden_size"] = config.hidden_size
olive_config["passes"]["optimize"]["num_heads"] = config.num_heads
olive_config["passes"]["optimize"]["num_key_value_heads"] = config.num_key_value_heads
@@ -224,7 +223,7 @@ def optimize(

olive_run(olive_config)

footprints_file_path = Path(__file__).resolve().parent / "footprints" / f"{model_name}_gpu-{device}_footprints.json"
footprints_file_path = Path(__file__).resolve().parent / "footprints" / "footprints.json"
with footprints_file_path.open("r") as footprint_file:
footprints = json.load(footprint_file)

@@ -118,6 +118,5 @@
"host": "local_system",
"target": "local_system",
"cache_dir": "cache",
"output_name": "text_encoder",
"output_dir": "footprints"
"output_dir": "footprints/text_encoder"
}
@@ -158,6 +158,5 @@
"host": "local_system",
"target": "local_system",
"cache_dir": "cache",
"output_name": "text_encoder_2",
"output_dir": "footprints"
"output_dir": "footprints/text_encoder_2"
}
3 changes: 1 addition & 2 deletions examples/directml/stable_diffusion_xl/config_unet.json
@@ -102,6 +102,5 @@
"host": "local_system",
"target": "local_system",
"cache_dir": "cache",
"output_name": "unet",
"output_dir": "footprints"
"output_dir": "footprints/unet"
}
@@ -114,6 +114,5 @@
"host": "local_system",
"target": "local_system",
"cache_dir": "cache",
"output_name": "vae_decoder",
"output_dir": "footprints"
"output_dir": "footprints/vae_decoder"
}
@@ -93,6 +93,5 @@
"host": "local_system",
"target": "local_system",
"cache_dir": "cache",
"output_name": "vae_encoder",
"output_dir": "footprints"
"output_dir": "footprints/vae_encoder"
}
4 changes: 1 addition & 3 deletions examples/directml/stable_diffusion_xl/stable_diffusion_xl.py
@@ -366,9 +366,7 @@ def optimize(

olive_run(olive_config)

footprints_file_path = (
Path(__file__).resolve().parent / "footprints" / f"{submodel_name}_gpu-{provider}_footprints.json"
)
footprints_file_path = Path(__file__).resolve().parent / "footprints" / "footprints.json"
with footprints_file_path.open("r") as footprint_file:
footprints = json.load(footprint_file)

3 changes: 1 addition & 2 deletions examples/inception/inception_config.json
@@ -73,6 +73,5 @@
"host": "local_system",
"evaluator": "common_evaluator",
"cache_dir": "cache",
"output_dir": "outputs",
"output_name": "snpe_quantized"
"output_dir": "outputs"
}
42 changes: 29 additions & 13 deletions examples/llama2/llama2_multilora.ipynb
@@ -64,10 +64,9 @@
"source": [
"## Workflow\n",
"\n",
"Olive provides a command line tool to run a lora/qlora fine-tuning workflow.\n",
"\n",
"It performs the optimization pipeline:\n",
"- GPU, FP16: *Pytorch Model -> Fine-tuned Pytorch Model -> Onnx Model -> Transformers Optimized Onnx Model fp16 -> Extract Adapters*"
"Olive provides command line tools to run a LoRA/QLoRA fine-tuning workflow. This workflow includes the following steps:\n",
"- `finetune`: Fine-tune a model using LoRA or QLoRA.\n",
"- `generate-adapter`: Export the fine-tuned model to ONNX, optimize it and extract the adapters as model inputs."
]
},
{
@@ -76,15 +75,16 @@
"metadata": {},
"outputs": [],
"source": [
"# run this cell to see the available options to finetune command\n",
"!olive finetune --help"
"# run this cell to see the available options to finetune and generate-adapter commands\n",
"!olive finetune --help\n",
"!olive generate-adapter --help"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let us now fine tune the llama2 model using QLoRA on [nampdn-ai/tiny-codes](https://huggingface.co/datasets/nampdn-ai/tiny-codes) to generate python code given a langauge and prompt."
"First, fine tune the llama2 model using QLoRA on [nampdn-ai/tiny-codes](https://huggingface.co/datasets/nampdn-ai/tiny-codes) to generate python code given a language and prompt."
]
},
{
@@ -97,17 +97,33 @@
" -m meta-llama/Llama-2-7b-hf -d nampdn-ai/tiny-codes \\\n",
" --train_split \"train[:4096]\" --eval_split \"train[4096:4224]\" \\\n",
" --text_template \"### Language: {programming_language} \\n### Question: {prompt} \\n### Answer: {response}\" \\\n",
" --per_device_train_batch_size 16 --per_device_eval_batch_size 16 --max_steps 150 --logging_steps 50 \\\n",
" -o models/tiny-codes --use_ort_genai"
" --per_device_train_batch_size 16 --per_device_eval_batch_size 16 --max_steps 15 --logging_steps 5 \\\n",
" -o models/tiny-codes/fine-tune"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, let's generate the optimized onnx model with adapters as inputs. We can use the output of the previous step as input to this step."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!olive generate-adapter -m models/tiny-codes/fine-tune --use_ort_genai -o models/tiny-codes/optimized"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The output model files can be found at:\n",
"- Model: `models/tiny-codes/model.onnx`\n",
"- Adapter weights: `models/tiny-codes/adapter_weights.npz`"
"- Model: `models/tiny-codes/optimized/model/model.onnx`\n",
"- Adapter weights: `models/tiny-codes/optimized/model/adapter_weights.npz`"
]
},
{
@@ -156,14 +172,14 @@
"outputs": [],
"source": [
"base_model_name = \"meta-llama/llama-2-7b-hf\"\n",
"model_path = \"models/tiny-codes/model.onnx\"\n",
"model_path = \"models/tiny-codes/optimized/model/model.onnx\"\n",
"adapters = {\n",
" \"guanaco\": {\n",
" \"weights\": \"models/exported/guanaco_qlora.npz\",\n",
" \"template\": \"### Human: {prompt} ### Assistant:\"\n",
" },\n",
" \"tiny-codes\": {\n",
" \"weights\": \"models/tiny-codes/adapter_weights.npz\",\n",
" \"weights\": \"models/tiny-codes/optimized/model/adapter_weights.npz\",\n",
" \"template\": \"### Language: {prompt_0} \\n### Question: {prompt_1} \\n### Answer: \"\n",
" }\n",
"}"
1 change: 1 addition & 0 deletions examples/llama2/llama2_template.json
@@ -118,6 +118,7 @@
"gptq_quant_int4": { "type": "GptqQuantizer", "data_config": "wikitext2_train" }
},
"evaluator": "merged_evaluator",
"evaluate_input_model": false,
"host": "local_system",
"target": "local_system",
"cache_dir": "cache",
3 changes: 1 addition & 2 deletions examples/mistral/mistral_fp16_optimize.json
@@ -50,6 +50,5 @@
"host": "local_system",
"target": "local_system",
"cache_dir": "cache",
"output_dir": "models",
"output_name": "mistral_fp16"
"output_dir": "models/mistral_fp16"
}
3 changes: 1 addition & 2 deletions examples/mistral/mistral_int4_optimize.json
@@ -63,6 +63,5 @@
"host": "local_system",
"target": "local_system",
"cache_dir": "cache",
"output_dir": "models",
"output_name": "mistral_int4"
"output_dir": "models/mistral_int4"
}
1 change: 0 additions & 1 deletion examples/open_llama/open_llama_arc.json
@@ -58,7 +58,6 @@
},
"evaluator": "common_evaluator",
"cache_dir": "cache",
"output_name": "ollama",
"target": "azure_arc",
"host": "aml",
"output_dir": "models/open_llama_arc"
1 change: 0 additions & 1 deletion examples/open_llama/open_llama_config.json
@@ -39,7 +39,6 @@
},
"evaluator": "common_evaluator",
"cache_dir": "cache",
"output_name": "ollama",
"host": "local_system",
"target": "local_system",
"output_dir": "models/open_llama"
1 change: 0 additions & 1 deletion examples/open_llama/open_llama_inc_woq.json
@@ -54,6 +54,5 @@
},
"evaluator": "common_evaluator",
"cache_dir": "cache",
"output_name": "ollama",
"output_dir": "models/open_llama_inc_woq"
}
1 change: 0 additions & 1 deletion examples/phi2/phi2_optimize_template.json
@@ -148,7 +148,6 @@
"host": "local_system",
"target": "local_system",
"cache_dir": "cache",
"output_name": "phi2",
"output_dir": "phi2",
"clean_cache": false,
"log_severity_level": 0,
4 changes: 1 addition & 3 deletions examples/phi3/.gitignore
@@ -1,3 +1 @@
phi3_cpu*.json
phi3_gpu*.json
phi3_quarot.json
phi3_run_*.json
2 changes: 1 addition & 1 deletion examples/phi3/README.md
@@ -72,7 +72,7 @@ If you have an Olive configuration file, you can also run the olive command for
olive run [--config CONFIGURATION_FILE]
# Examples
olive run --config phi3_mobile_int4.json
olive run --config phi3_run_mobile_int4.json
```

We also introduce QuaRot, a new Quantization scheme based on Rotations, which is able to quantize LLMs end-to-end.
27 changes: 8 additions & 19 deletions examples/phi3/phi3.py
@@ -11,8 +11,8 @@

import onnxruntime_genai as og

from olive.common.utils import hardlink_copy_dir, unescaped_str
from olive.hardware import AcceleratorSpec
from olive.cli.base import save_output_model
from olive.common.utils import unescaped_str
from olive.workflows import run as olive_run

# flake8: noqa: T201
@@ -164,20 +164,7 @@ def main(raw_args=None):
if args.quarot:
return

# need to improve the output structure of olive run
output_path.mkdir(parents=True, exist_ok=True)
accelerator = run_config["systems"]["local_system"]["accelerators"][0]
accelerator_str = str(
AcceleratorSpec(
accelerator_type=accelerator["device"], execution_provider=accelerator["execution_providers"][0]
)
)
hardlink_copy_dir(
Path(tempdir) / "-".join(run_config["passes"].keys()) / f"{accelerator_str}_model",
output_path,
)

print("\nOptimized model is generated in", args.output_dir)
save_output_model(run_config, output_path)

if args.inference:
if not args.chat_template:
@@ -193,7 +180,7 @@

max_length = 200 if not args.max_length else args.max_length

genai_run(prompts, str(output_path), max_length)
genai_run(prompts, str(output_path / "model"), max_length)


def use_passes(template_json, *passes):
@@ -226,12 +213,14 @@ def generate_config(args):
with open(json_file_template) as f:
template_json = json.load(f)

config_prefix = "phi3_run_"

if args.quarot:
template_json = use_passes(template_json, "quarot")
template_json["systems"]["local_system"]["accelerators"] = [
{"device": "GPU", "execution_providers": ["CUDAExecutionProvider"]}
]
new_json_file = "phi3_quarot.json"
new_json_file = f"{config_prefix}quarot.json"
with open(new_json_file, "w") as f:
json.dump(template_json, f, indent=4)

@@ -277,7 +266,7 @@ def generate_config(args):
# set cache dir
template_json["cache_dir"] = args.cache_dir

new_json_file = f"phi3_{target.lower()}_{args.precision}.json"
new_json_file = f"{config_prefix}{target.lower()}_{args.precision}.json"
with open(new_json_file, "w") as f:
json.dump(template_json, f, indent=4)

14 changes: 8 additions & 6 deletions examples/phi3/phi3_vision.py
@@ -42,9 +42,11 @@ def get_args(raw_args):
default="int4",
choices=["int4", "fp16"],
help=(
"Precision of optimized model. "
"int4: run quantization on the model, which is able to run on CPU and CUDA."
"fp16: no quantization, only run on CUDA.",
(
"Precision of optimized model. "
"int4: run quantization on the model, which is able to run on CPU and CUDA."
"fp16: no quantization, only run on CUDA."
),
),
)
parser.add_argument(
@@ -60,7 +62,7 @@
parser.add_argument(
"--output_dir",
type=str,
default="cache/phi3-vision-128k-instruct",
default="models/phi3-vision-128k-instruct",
required=False,
help="Path to folder to store ONNX model and additional files (e.g. GenAI config, external data files, etc.)",
)
@@ -126,7 +128,7 @@ def main(raw_args=None):
generate(args.optimized_model_path)
return

input_model_path = output_dir / "phi3-vision-128k-instruct" / "pytorch"
input_model_path = output_dir / "pytorch"
if not is_model_ready(input_model_path):
print(f"Model not found from {input_model_path}, preparing the model...")
# prepare the input model
@@ -180,7 +182,7 @@ def main(raw_args=None):
to_remove_folders = [
Path(args.output_dir).resolve() / "vision",
Path(args.output_dir).resolve() / "text",
Path(args.output_dir).resolve() / "text-embedding",
Path(args.output_dir).resolve() / "text_embedding",
]
for folder in to_remove_folders:
shutil.rmtree(folder, ignore_errors=True)
@@ -8,7 +8,7 @@ then
echo "Usage: prepare_phi3_vision_for_olive.sh <output_dir>"
exit 1
else
base_output_dir="$1"/phi3-vision-128k-instruct
base_output_dir="$1"
pytorch_output_dir="$base_output_dir"/pytorch
fi

3 changes: 1 addition & 2 deletions examples/stable_diffusion/config_safety_checker.json
@@ -95,6 +95,5 @@
"host": "local_system",
"target": "local_system",
"cache_dir": "cache",
"output_name": "safety_checker",
"output_dir": "footprints"
"output_dir": "footprints/safety_checker"
}
3 changes: 1 addition & 2 deletions examples/stable_diffusion/config_text_encoder.json
@@ -92,6 +92,5 @@
"host": "local_system",
"target": "local_system",
"cache_dir": "cache",
"output_name": "text_encoder",
"output_dir": "footprints"
"output_dir": "footprints/text_encoder"
}
3 changes: 1 addition & 2 deletions examples/stable_diffusion/config_unet.json
@@ -107,6 +107,5 @@
"host": "local_system",
"target": "local_system",
"cache_dir": "cache",
"output_name": "unet",
"output_dir": "footprints"
"output_dir": "footprints/unet"
}