Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Very slow inference of optimized whisper gpu #1300

Open
thewh1teagle opened this issue Aug 10, 2024 · 0 comments
Open

Very slow inference of optimized whisper gpu #1300

thewh1teagle opened this issue Aug 10, 2024 · 0 comments

Comments

@thewh1teagle
Copy link

thewh1teagle commented Aug 10, 2024

Describe the bug
I optimized Whisper medium (INT8) on an Ubuntu server with 64 cores, an RTX 3070, and 128 GB of RAM.
When running test_transcription, it took a minute.

To Reproduce
Optimize Whisper medium (INT8) with CUDA 11, then run test_transcription.

Expected behavior
It should run faster.

Olive config

{
    "input_model": {
        "type": "CompositeModel",
        "model_component_names": [
            "encoder_decoder_init",
            "decoder"
        ],
        "model_components": [
            {
                "type": "PyTorchModel",
                "model_path": "openai/whisper-medium",
                "model_script": "code/user_script.py",
                "script_dir": "code",
                "model_loader": "get_encoder_decoder_init",
                "io_config": "get_encdec_io_config",
                "dummy_inputs_func": "encoder_decoder_init_dummy_inputs"
            },
            {
                "type": "PyTorchModel",
                "model_path": "openai/whisper-medium",
                "model_script": "code/user_script.py",
                "script_dir": "code",
                "model_loader": "get_decoder",
                "io_config": "get_dec_io_config",
                "dummy_inputs_func": "decoder_dummy_inputs"
            }
        ],
        "model_attributes": {
            "vocab_size": 51865,
            "num_mel_bins": 80,
            "d_model": 1024,
            "encoder_layers": 24,
            "encoder_attention_heads": 16,
            "decoder_layers": 24,
            "decoder_attention_heads": 16,
            "decoder_ffn_dim": 4096,
            "encoder_ffn_dim": 4096,
            "dropout": 0.0,
            "attention_dropout": 0.0,
            "activation_dropout": 0.0,
            "activation_function": "gelu",
            "init_std": 0.02,
            "encoder_layerdrop": 0.0,
            "decoder_layerdrop": 0.0,
            "use_cache": true,
            "num_hidden_layers": 24,
            "scale_embedding": false,
            "max_source_positions": 1500,
            "max_target_positions": 448,
            "classifier_proj_size": 256,
            "use_weighted_layer_sum": false,
            "apply_spec_augment": false,
            "mask_time_prob": 0.05,
            "mask_time_length": 10,
            "mask_time_min_masks": 2,
            "mask_feature_prob": 0.0,
            "mask_feature_length": 10,
            "mask_feature_min_masks": 0,
            "median_filter_width": 7,
            "return_dict": true,
            "output_hidden_states": false,
            "output_attentions": false,
            "torchscript": false,
            "torch_dtype": "float32",
            "use_bfloat16": false,
            "tf_legacy_loss": false,
            "pruned_heads": {},
            "tie_word_embeddings": true,
            "chunk_size_feed_forward": 0,
            "is_encoder_decoder": true,
            "is_decoder": false,
            "cross_attention_hidden_size": null,
            "add_cross_attention": false,
            "tie_encoder_decoder": false,
            "max_length": 448,
            "min_length": 0,
            "do_sample": false,
            "early_stopping": false,
            "num_beams": 1,
            "num_beam_groups": 1,
            "diversity_penalty": 0.0,
            "temperature": 1.0,
            "top_k": 50,
            "top_p": 1.0,
            "typical_p": 1.0,
            "repetition_penalty": 1.0,
            "length_penalty": 1.0,
            "no_repeat_ngram_size": 0,
            "encoder_no_repeat_ngram_size": 0,
            "bad_words_ids": null,
            "num_return_sequences": 1,
            "output_scores": false,
            "return_dict_in_generate": false,
            "forced_bos_token_id": null,
            "forced_eos_token_id": null,
            "remove_invalid_values": false,
            "exponential_decay_length_penalty": null,
            "begin_suppress_tokens": [
                220,
                50257
            ],
            "architectures": [
                "WhisperForConditionalGeneration"
            ],
            "finetuning_task": null,
            "id2label": {
                "0": "LABEL_0",
                "1": "LABEL_1"
            },
            "label2id": {
                "LABEL_0": 0,
                "LABEL_1": 1
            },
            "tokenizer_class": null,
            "prefix": null,
            "bos_token_id": 50257,
            "pad_token_id": 50257,
            "eos_token_id": 50257,
            "sep_token_id": null,
            "decoder_start_token_id": 50258,
            "task_specific_params": null,
            "problem_type": null,
            "_name_or_path": "openai/whisper-medium",
            "transformers_version": "4.42.4",
            "forced_decoder_ids": [
                [
                    1,
                    50259
                ],
                [
                    2,
                    50359
                ],
                [
                    3,
                    50363
                ]
            ],
            "model_type": "whisper"
        }
    },
    "systems": {
        "local_system": {
            "type": "LocalSystem",
            "accelerators": [
                {
                    "device": "gpu",
                    "execution_providers": [
                        "CUDAExecutionProvider"
                    ]
                }
            ]
        }
    },
    "data_configs": [
        {
            "name": "latency_data_config",
            "user_script": "code/user_script.py",
            "script_dir": "code",
            "load_dataset_config": {
                "type": "whisper_dataset",
                "data_dir": "data",
                "model_name": "openai/whisper-medium",
                "use_audio_decoder": true
            },
            "dataloader_config": {
                "type": "no_auto_batch_dataloader"
            }
        }
    ],
    "evaluators": {
        "common_evaluator": {
            "metrics": [
                {
                    "name": "latency",
                    "type": "latency",
                    "sub_types": [
                        {
                            "name": "avg",
                            "priority": 1
                        }
                    ],
                    "data_config": "latency_data_config"
                }
            ]
        }
    },
    "passes": {
        "conversion": {
            "type": "OnnxConversion",
            "target_opset": 17
        },
        "transformers_optimization": {
            "type": "OrtTransformersOptimization",
            "optimization_options": {
                "use_multi_head_attention": true
            },
            "use_gpu": true
        },
        "onnx_dynamic_quantization": {
            "type": "OnnxDynamicQuantization",
            "per_channel": false,
            "reduce_range": false,
            "op_types_to_quantize": [
                "MatMul",
                "Gemm",
                "Gather"
            ],
            "MatMulConstBOnly": false
        },
        "insert_beam_search": {
            "type": "InsertBeamSearch",
            "use_forced_decoder_ids": true,
            "use_logits_processor": false,
            "fp16": false
        },
        "prepost": {
            "type": "AppendPrePostProcessingOps",
            "tool_command": "whisper",
            "tool_command_args": {
                "model_name": "openai/whisper-medium",
                "use_audio_decoder": true
            },
            "target_opset": 17
        }
    },
    "log_severity_level": 0,
    "host": "local_system",
    "target": "local_system",
    "evaluator": "common_evaluator",
    "evaluate_input_model": false,
    "clean_cache": false,
    "cache_dir": "cache",
    "output_dir": "models",
    "output_name": "whisper_gpu_int8"
}

Olive logs
Add logs here.

Other information

  • OS: Linux
  • Olive version: 0.4.0
  • ONNXRuntime package and version: onnxruntime-gpu==1.18.1

Additional context
https://huggingface.co/thewh1teagle/whisper-olive/tree/main

At this stage, it might be helpful to add some notes to the main README of Whisper about its current capabilities. It doesn't seem completely stable yet, and I had to run several tests to get a clear understanding. Thanks!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant